commit 7409e082e41a0bee5da1b2a029f1500bb5c24f25 Author: ModelHub XC Date: Wed May 27 18:20:39 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Neelectric/Llama-3.1-8B-Instruct_SFT_sciencefisher_v00.12 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md 
b/README.md new file mode 100644 index 0000000..be4b57e --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/MoT_science_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_sciencefisher_v00.12 +tags: +- generated_from_trainer +- trl +- sft +- open-r1 +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_sciencefisher_v00.12 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/MoT_science_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/MoT_science_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_sciencefisher_v00.12", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_science/runs/vhlno3k3) + + + +This model was trained with SFT. 
+ +### Framework versions + +- TRL: 1.0.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.4 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..7953218 --- /dev/null +++ b/all_results.json @@ -0,0 +1,9 @@ +{ + "ewc_loss": 2.2411346435546875e-05, + "total_flos": 3.1829322264618205e+19, + "train_loss": 0.9959457731411026, + "train_runtime": 37043.5107, + "train_samples": 145693, + "train_samples_per_second": 11.799, + "train_steps_per_second": 0.737 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. 
Respond in the following format: <think> +... +</think> +<answer> +... +</answer>" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} 
+{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..06df27b --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..50f6077 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..19e9f99 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:355821a2b19ec87cd652d2c168c25bc9faed72056c183e4583efa65ba8a09f31 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..0fb6f71 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c37b39786a5b23afbe3ad952c5766f390533fa4d1a093e661ea172d2aec4e3ec +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..fffd761 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05194ddae58920b39a0b466458340fe1e644b8f3f03be9418dc1e24832f1a806 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..96b0ec7 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07efadccdaeab6cfea8712f53eff9e87f1e9f2ae83f933199c5411e933f55ec7 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9d4773c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,11 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3beeacc --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": 
"<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": 
"<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": 
"<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": 
"<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..7953218 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "ewc_loss": 2.2411346435546875e-05, + "total_flos": 3.1829322264618205e+19, + "train_loss": 0.9959457731411026, + "train_runtime": 37043.5107, + "train_samples": 145693, + "train_samples_per_second": 11.799, + "train_steps_per_second": 0.737 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..b854f53 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,273224 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 27318, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010981770261366133, + "ewc_loss": 0.0, + "grad_norm": 6.514667510986328, + "learning_rate": 0.0, + "loss": 1.5235, + 
"mean_token_accuracy": 0.6157128810882568, + "num_tokens": 30018.0, + "step": 1 + }, + { + "epoch": 0.00021963540522732265, + "ewc_loss": 0.0, + "grad_norm": 6.4087958335876465, + "learning_rate": 3.6603221083455343e-10, + "loss": 1.4062, + "mean_token_accuracy": 0.628243088722229, + "num_tokens": 58744.0, + "step": 2 + }, + { + "epoch": 0.000329453107840984, + "ewc_loss": 5.488773498207866e-19, + "grad_norm": 7.284616947174072, + "learning_rate": 7.320644216691069e-10, + "loss": 1.4743, + "mean_token_accuracy": 0.6152434945106506, + "num_tokens": 84293.0, + "step": 3 + }, + { + "epoch": 0.0004392708104546453, + "ewc_loss": 2.5912431922403556e-17, + "grad_norm": 7.035748481750488, + "learning_rate": 1.0980966325036603e-09, + "loss": 1.5504, + "mean_token_accuracy": 0.5984039306640625, + "num_tokens": 113639.0, + "step": 4 + }, + { + "epoch": 0.0005490885130683066, + "ewc_loss": 5.93275428784068e-16, + "grad_norm": 7.090041637420654, + "learning_rate": 1.4641288433382137e-09, + "loss": 1.5152, + "mean_token_accuracy": 0.6071116924285889, + "num_tokens": 139615.0, + "step": 5 + }, + { + "epoch": 0.000658906215681968, + "ewc_loss": 7.008282842946301e-16, + "grad_norm": 6.85969877243042, + "learning_rate": 1.8301610541727671e-09, + "loss": 1.6124, + "mean_token_accuracy": 0.581271231174469, + "num_tokens": 168855.0, + "step": 6 + }, + { + "epoch": 0.0007687239182956292, + "ewc_loss": 2.192690473634684e-15, + "grad_norm": 7.6452250480651855, + "learning_rate": 2.1961932650073206e-09, + "loss": 1.5225, + "mean_token_accuracy": 0.603061318397522, + "num_tokens": 192650.0, + "step": 7 + }, + { + "epoch": 0.0008785416209092906, + "ewc_loss": 1.0935696792557792e-14, + "grad_norm": 6.315532207489014, + "learning_rate": 2.562225475841874e-09, + "loss": 1.5187, + "mean_token_accuracy": 0.6046382188796997, + "num_tokens": 225812.0, + "step": 8 + }, + { + "epoch": 0.000988359323522952, + "ewc_loss": 1.9206858326015208e-14, + "grad_norm": 8.882826805114746, + "learning_rate": 
2.9282576866764274e-09, + "loss": 1.5177, + "mean_token_accuracy": 0.6052026748657227, + "num_tokens": 245223.0, + "step": 9 + }, + { + "epoch": 0.0010981770261366132, + "ewc_loss": 2.3869795029440866e-14, + "grad_norm": 6.882201194763184, + "learning_rate": 3.294289897510981e-09, + "loss": 1.566, + "mean_token_accuracy": 0.5966366529464722, + "num_tokens": 273619.0, + "step": 10 + }, + { + "epoch": 0.0012079947287502745, + "ewc_loss": 7.327471962526033e-14, + "grad_norm": 7.191747188568115, + "learning_rate": 3.6603221083455343e-09, + "loss": 1.3783, + "mean_token_accuracy": 0.6423099637031555, + "num_tokens": 297418.0, + "step": 11 + }, + { + "epoch": 0.001317812431363936, + "ewc_loss": 9.903189379656396e-14, + "grad_norm": 6.077397346496582, + "learning_rate": 4.026354319180088e-09, + "loss": 1.3927, + "mean_token_accuracy": 0.6353145837783813, + "num_tokens": 328966.0, + "step": 12 + }, + { + "epoch": 0.0014276301339775973, + "ewc_loss": 1.341149413747189e-13, + "grad_norm": 7.558345317840576, + "learning_rate": 4.392386530014641e-09, + "loss": 1.3895, + "mean_token_accuracy": 0.6390926241874695, + "num_tokens": 353003.0, + "step": 13 + }, + { + "epoch": 0.0015374478365912585, + "ewc_loss": 1.6431300764452317e-13, + "grad_norm": 7.24896240234375, + "learning_rate": 4.758418740849194e-09, + "loss": 1.4651, + "mean_token_accuracy": 0.6210266351699829, + "num_tokens": 377957.0, + "step": 14 + }, + { + "epoch": 0.0016472655392049198, + "ewc_loss": 5.968558980384842e-13, + "grad_norm": 8.104095458984375, + "learning_rate": 5.124450951683748e-09, + "loss": 1.4739, + "mean_token_accuracy": 0.6141138076782227, + "num_tokens": 398957.0, + "step": 15 + }, + { + "epoch": 0.0017570832418185812, + "ewc_loss": 7.318590178329032e-13, + "grad_norm": 7.218289375305176, + "learning_rate": 5.490483162518302e-09, + "loss": 1.5629, + "mean_token_accuracy": 0.5935758352279663, + "num_tokens": 424867.0, + "step": 16 + }, + { + "epoch": 0.0018669009444322424, + "ewc_loss": 
8.277822871605167e-13, + "grad_norm": 6.992076873779297, + "learning_rate": 5.856515373352855e-09, + "loss": 1.4465, + "mean_token_accuracy": 0.6199201345443726, + "num_tokens": 450753.0, + "step": 17 + }, + { + "epoch": 0.001976718647045904, + "ewc_loss": 1.0089706847793423e-12, + "grad_norm": 6.825626850128174, + "learning_rate": 6.222547584187408e-09, + "loss": 1.4526, + "mean_token_accuracy": 0.6167707443237305, + "num_tokens": 478721.0, + "step": 18 + }, + { + "epoch": 0.002086536349659565, + "ewc_loss": 1.1510792319313623e-12, + "grad_norm": 8.605110168457031, + "learning_rate": 6.588579795021962e-09, + "loss": 1.5915, + "mean_token_accuracy": 0.5919297933578491, + "num_tokens": 499326.0, + "step": 19 + }, + { + "epoch": 0.0021963540522732263, + "ewc_loss": 3.495870259939693e-12, + "grad_norm": 7.642739295959473, + "learning_rate": 6.954612005856515e-09, + "loss": 1.4422, + "mean_token_accuracy": 0.6243001222610474, + "num_tokens": 522720.0, + "step": 20 + }, + { + "epoch": 0.002306171754886888, + "ewc_loss": 4.945377440890297e-12, + "grad_norm": 6.30642032623291, + "learning_rate": 7.3206442166910685e-09, + "loss": 1.4214, + "mean_token_accuracy": 0.6322819590568542, + "num_tokens": 551422.0, + "step": 21 + }, + { + "epoch": 0.002415989457500549, + "ewc_loss": 6.0254023992456496e-12, + "grad_norm": 8.075174331665039, + "learning_rate": 7.686676427525622e-09, + "loss": 1.5616, + "mean_token_accuracy": 0.6011888980865479, + "num_tokens": 572912.0, + "step": 22 + }, + { + "epoch": 0.0025258071601142102, + "ewc_loss": 6.480149750132114e-12, + "grad_norm": 5.991759777069092, + "learning_rate": 8.052708638360176e-09, + "loss": 1.4109, + "mean_token_accuracy": 0.6286705732345581, + "num_tokens": 604172.0, + "step": 23 + }, + { + "epoch": 0.002635624862727872, + "ewc_loss": 7.275957614183426e-12, + "grad_norm": 7.518413543701172, + "learning_rate": 8.418740849194728e-09, + "loss": 1.4469, + "mean_token_accuracy": 0.6205610632896423, + "num_tokens": 627461.0, + 
"step": 24 + }, + { + "epoch": 0.002745442565341533, + "ewc_loss": 8.071765478234738e-12, + "grad_norm": 6.361715793609619, + "learning_rate": 8.784773060029282e-09, + "loss": 1.448, + "mean_token_accuracy": 0.627144455909729, + "num_tokens": 657516.0, + "step": 25 + }, + { + "epoch": 0.0028552602679551946, + "ewc_loss": 8.810729923425242e-12, + "grad_norm": 6.295804500579834, + "learning_rate": 9.150805270863836e-09, + "loss": 1.5435, + "mean_token_accuracy": 0.5953788757324219, + "num_tokens": 691937.0, + "step": 26 + }, + { + "epoch": 0.0029650779705688557, + "ewc_loss": 1.3415046851150692e-11, + "grad_norm": 8.503387451171875, + "learning_rate": 9.516837481698389e-09, + "loss": 1.4646, + "mean_token_accuracy": 0.626865565776825, + "num_tokens": 711900.0, + "step": 27 + }, + { + "epoch": 0.003074895673182517, + "ewc_loss": 3.205968823749572e-11, + "grad_norm": 7.234290599822998, + "learning_rate": 9.882869692532943e-09, + "loss": 1.4915, + "mean_token_accuracy": 0.6120007038116455, + "num_tokens": 737085.0, + "step": 28 + }, + { + "epoch": 0.0031847133757961785, + "ewc_loss": 3.979039320256561e-11, + "grad_norm": 8.078184127807617, + "learning_rate": 1.0248901903367497e-08, + "loss": 1.5751, + "mean_token_accuracy": 0.6030700206756592, + "num_tokens": 759817.0, + "step": 29 + }, + { + "epoch": 0.0032945310784098397, + "ewc_loss": 4.4792614062316716e-11, + "grad_norm": 7.589539051055908, + "learning_rate": 1.0614934114202049e-08, + "loss": 1.4181, + "mean_token_accuracy": 0.6262956261634827, + "num_tokens": 783353.0, + "step": 30 + }, + { + "epoch": 0.003404348781023501, + "ewc_loss": 4.843059286940843e-11, + "grad_norm": 8.340238571166992, + "learning_rate": 1.0980966325036603e-08, + "loss": 1.4706, + "mean_token_accuracy": 0.6185628771781921, + "num_tokens": 804197.0, + "step": 31 + }, + { + "epoch": 0.0035141664836371624, + "ewc_loss": 5.070432962384075e-11, + "grad_norm": 6.862918853759766, + "learning_rate": 1.1346998535871157e-08, + "loss": 1.5126, + 
"mean_token_accuracy": 0.6060153841972351, + "num_tokens": 830796.0, + "step": 32 + }, + { + "epoch": 0.0036239841862508236, + "ewc_loss": 5.411493475548923e-11, + "grad_norm": 6.579805850982666, + "learning_rate": 1.171303074670571e-08, + "loss": 1.4164, + "mean_token_accuracy": 0.6413529515266418, + "num_tokens": 858092.0, + "step": 33 + }, + { + "epoch": 0.0037338018888644848, + "ewc_loss": 6.048139766789973e-11, + "grad_norm": 7.303385257720947, + "learning_rate": 1.2079062957540264e-08, + "loss": 1.4555, + "mean_token_accuracy": 0.6133275032043457, + "num_tokens": 882474.0, + "step": 34 + }, + { + "epoch": 0.0038436195914781464, + "ewc_loss": 6.366462912410498e-11, + "grad_norm": 8.532353401184082, + "learning_rate": 1.2445095168374816e-08, + "loss": 1.457, + "mean_token_accuracy": 0.6255998611450195, + "num_tokens": 902361.0, + "step": 35 + }, + { + "epoch": 0.003953437294091808, + "ewc_loss": 6.866684998385608e-11, + "grad_norm": 7.349920272827148, + "learning_rate": 1.281112737920937e-08, + "loss": 1.5081, + "mean_token_accuracy": 0.6057035326957703, + "num_tokens": 927171.0, + "step": 36 + }, + { + "epoch": 0.004063254996705469, + "ewc_loss": 7.275957614183426e-11, + "grad_norm": 7.075529098510742, + "learning_rate": 1.3177159590043924e-08, + "loss": 1.4719, + "mean_token_accuracy": 0.6121363639831543, + "num_tokens": 952048.0, + "step": 37 + }, + { + "epoch": 0.00417307269931913, + "ewc_loss": 1.673470251262188e-10, + "grad_norm": 6.849883556365967, + "learning_rate": 1.3543191800878477e-08, + "loss": 1.574, + "mean_token_accuracy": 0.596491813659668, + "num_tokens": 981040.0, + "step": 38 + }, + { + "epoch": 0.004282890401932792, + "ewc_loss": 2.473825588822365e-10, + "grad_norm": 6.892486095428467, + "learning_rate": 1.390922401171303e-08, + "loss": 1.4834, + "mean_token_accuracy": 0.6070586442947388, + "num_tokens": 1008756.0, + "step": 39 + }, + { + "epoch": 0.004392708104546453, + "ewc_loss": 2.9467628337442875e-10, + "grad_norm": 7.084540843963623, 
+ "learning_rate": 1.4275256222547585e-08, + "loss": 1.4987, + "mean_token_accuracy": 0.6016949415206909, + "num_tokens": 1035140.0, + "step": 40 + }, + { + "epoch": 0.004502525807160114, + "ewc_loss": 3.2014213502407074e-10, + "grad_norm": 7.7927374839782715, + "learning_rate": 1.4641288433382137e-08, + "loss": 1.5251, + "mean_token_accuracy": 0.602307915687561, + "num_tokens": 1058792.0, + "step": 41 + }, + { + "epoch": 0.004612343509773776, + "ewc_loss": 3.346940502524376e-10, + "grad_norm": 5.659409523010254, + "learning_rate": 1.500732064421669e-08, + "loss": 1.4629, + "mean_token_accuracy": 0.6146625876426697, + "num_tokens": 1096383.0, + "step": 42 + }, + { + "epoch": 0.0047221612123874365, + "ewc_loss": 3.54702933691442e-10, + "grad_norm": 6.7932586669921875, + "learning_rate": 1.5373352855051243e-08, + "loss": 1.5747, + "mean_token_accuracy": 0.5915776491165161, + "num_tokens": 1125313.0, + "step": 43 + }, + { + "epoch": 0.004831978915001098, + "ewc_loss": 3.6925484891980886e-10, + "grad_norm": 6.298901081085205, + "learning_rate": 1.57393850658858e-08, + "loss": 1.4454, + "mean_token_accuracy": 0.6231790781021118, + "num_tokens": 1155650.0, + "step": 44 + }, + { + "epoch": 0.00494179661761476, + "ewc_loss": 3.80168785341084e-10, + "grad_norm": 7.56752347946167, + "learning_rate": 1.610541727672035e-08, + "loss": 1.4676, + "mean_token_accuracy": 0.6197543740272522, + "num_tokens": 1178843.0, + "step": 45 + }, + { + "epoch": 0.0050516143202284204, + "ewc_loss": 4.05634636990726e-10, + "grad_norm": 6.922905445098877, + "learning_rate": 1.6471449487554904e-08, + "loss": 1.5239, + "mean_token_accuracy": 0.6030544638633728, + "num_tokens": 1206271.0, + "step": 46 + }, + { + "epoch": 0.005161432022842082, + "ewc_loss": 4.474713932722807e-10, + "grad_norm": 5.847174644470215, + "learning_rate": 1.6837481698389456e-08, + "loss": 1.4709, + "mean_token_accuracy": 0.6167130470275879, + "num_tokens": 1241401.0, + "step": 47 + }, + { + "epoch": 0.005271249725455744, + 
"ewc_loss": 4.656612873077393e-10, + "grad_norm": 7.176879405975342, + "learning_rate": 1.7203513909224012e-08, + "loss": 1.4964, + "mean_token_accuracy": 0.611149787902832, + "num_tokens": 1267455.0, + "step": 48 + }, + { + "epoch": 0.005381067428069404, + "ewc_loss": 4.838511813431978e-10, + "grad_norm": 6.79125452041626, + "learning_rate": 1.7569546120058564e-08, + "loss": 1.3769, + "mean_token_accuracy": 0.6374930143356323, + "num_tokens": 1293955.0, + "step": 49 + }, + { + "epoch": 0.005490885130683066, + "ewc_loss": 5.093170329928398e-10, + "grad_norm": 7.762787818908691, + "learning_rate": 1.7935578330893117e-08, + "loss": 1.4396, + "mean_token_accuracy": 0.6285207271575928, + "num_tokens": 1316419.0, + "step": 50 + }, + { + "epoch": 0.0056007028332967276, + "ewc_loss": 5.238689482212067e-10, + "grad_norm": 7.735241889953613, + "learning_rate": 1.8301610541727673e-08, + "loss": 1.5092, + "mean_token_accuracy": 0.6068658232688904, + "num_tokens": 1338607.0, + "step": 51 + }, + { + "epoch": 0.005710520535910389, + "ewc_loss": 5.566107574850321e-10, + "grad_norm": 6.881763458251953, + "learning_rate": 1.8667642752562225e-08, + "loss": 1.4858, + "mean_token_accuracy": 0.608428955078125, + "num_tokens": 1366081.0, + "step": 52 + }, + { + "epoch": 0.00582033823852405, + "ewc_loss": 1.0550138540565968e-09, + "grad_norm": 7.397276401519775, + "learning_rate": 1.9033674963396777e-08, + "loss": 1.4825, + "mean_token_accuracy": 0.61363285779953, + "num_tokens": 1390090.0, + "step": 53 + }, + { + "epoch": 0.0059301559411377115, + "ewc_loss": 1.6298145055770874e-09, + "grad_norm": 7.4393696784973145, + "learning_rate": 1.9399707174231333e-08, + "loss": 1.4928, + "mean_token_accuracy": 0.610763430595398, + "num_tokens": 1413964.0, + "step": 54 + }, + { + "epoch": 0.006039973643751373, + "ewc_loss": 1.862645149230957e-09, + "grad_norm": 6.556291103363037, + "learning_rate": 1.9765739385065885e-08, + "loss": 1.5148, + "mean_token_accuracy": 0.6097484827041626, + 
"num_tokens": 1442849.0, + "step": 55 + }, + { + "epoch": 0.006149791346365034, + "ewc_loss": 2.0372681319713593e-09, + "grad_norm": 6.161547660827637, + "learning_rate": 2.0131771595900438e-08, + "loss": 1.5726, + "mean_token_accuracy": 0.5892431139945984, + "num_tokens": 1478493.0, + "step": 56 + }, + { + "epoch": 0.006259609048978695, + "ewc_loss": 2.2118911147117615e-09, + "grad_norm": 8.156291007995605, + "learning_rate": 2.0497803806734994e-08, + "loss": 1.5257, + "mean_token_accuracy": 0.616569995880127, + "num_tokens": 1498811.0, + "step": 57 + }, + { + "epoch": 0.006369426751592357, + "ewc_loss": 2.3137545213103294e-09, + "grad_norm": 6.748773574829102, + "learning_rate": 2.0863836017569546e-08, + "loss": 1.4464, + "mean_token_accuracy": 0.6246594190597534, + "num_tokens": 1525932.0, + "step": 58 + }, + { + "epoch": 0.006479244454206018, + "ewc_loss": 2.4010660126805305e-09, + "grad_norm": 7.322622299194336, + "learning_rate": 2.1229868228404098e-08, + "loss": 1.4911, + "mean_token_accuracy": 0.6093912124633789, + "num_tokens": 1550438.0, + "step": 59 + }, + { + "epoch": 0.006589062156819679, + "ewc_loss": 2.5029294192790985e-09, + "grad_norm": 8.592363357543945, + "learning_rate": 2.1595900439238654e-08, + "loss": 1.5009, + "mean_token_accuracy": 0.6130332946777344, + "num_tokens": 1569961.0, + "step": 60 + }, + { + "epoch": 0.006698879859433341, + "ewc_loss": 2.5902409106492996e-09, + "grad_norm": 5.989620208740234, + "learning_rate": 2.1961932650073206e-08, + "loss": 1.4965, + "mean_token_accuracy": 0.6061667203903198, + "num_tokens": 1604631.0, + "step": 61 + }, + { + "epoch": 0.006808697562047002, + "ewc_loss": 2.648448571562767e-09, + "grad_norm": 7.568010330200195, + "learning_rate": 2.232796486090776e-08, + "loss": 1.4842, + "mean_token_accuracy": 0.6135477423667908, + "num_tokens": 1628098.0, + "step": 62 + }, + { + "epoch": 0.006918515264660663, + "ewc_loss": 2.735760062932968e-09, + "grad_norm": 6.587136268615723, + "learning_rate": 
2.2693997071742314e-08, + "loss": 1.48, + "mean_token_accuracy": 0.620633065700531, + "num_tokens": 1656281.0, + "step": 63 + }, + { + "epoch": 0.007028332967274325, + "ewc_loss": 2.7794158086180687e-09, + "grad_norm": 7.344973087310791, + "learning_rate": 2.3060029282576867e-08, + "loss": 1.5131, + "mean_token_accuracy": 0.6157743334770203, + "num_tokens": 1680754.0, + "step": 64 + }, + { + "epoch": 0.007138150669887986, + "ewc_loss": 2.9103830456733704e-09, + "grad_norm": 7.5652337074279785, + "learning_rate": 2.342606149341142e-08, + "loss": 1.5117, + "mean_token_accuracy": 0.6081300973892212, + "num_tokens": 1704446.0, + "step": 65 + }, + { + "epoch": 0.007247968372501647, + "ewc_loss": 3.14321368932724e-09, + "grad_norm": 6.817201614379883, + "learning_rate": 2.3792093704245972e-08, + "loss": 1.2967, + "mean_token_accuracy": 0.6493930220603943, + "num_tokens": 1728711.0, + "step": 66 + }, + { + "epoch": 0.007357786075115309, + "ewc_loss": 3.2159732654690742e-09, + "grad_norm": 7.829528331756592, + "learning_rate": 2.4158125915080527e-08, + "loss": 1.4219, + "mean_token_accuracy": 0.6315031051635742, + "num_tokens": 1750331.0, + "step": 67 + }, + { + "epoch": 0.0074676037777289695, + "ewc_loss": 3.3614924177527428e-09, + "grad_norm": 6.240300178527832, + "learning_rate": 2.452415812591508e-08, + "loss": 1.4365, + "mean_token_accuracy": 0.6179032325744629, + "num_tokens": 1781737.0, + "step": 68 + }, + { + "epoch": 0.007577421480342631, + "ewc_loss": 3.4924596548080444e-09, + "grad_norm": 6.467519760131836, + "learning_rate": 2.4890190336749632e-08, + "loss": 1.4812, + "mean_token_accuracy": 0.6121597290039062, + "num_tokens": 1811021.0, + "step": 69 + }, + { + "epoch": 0.007687239182956293, + "ewc_loss": 3.5652192309498787e-09, + "grad_norm": 7.426461219787598, + "learning_rate": 2.5256222547584188e-08, + "loss": 1.5343, + "mean_token_accuracy": 0.5972800254821777, + "num_tokens": 1835564.0, + "step": 70 + }, + { + "epoch": 0.007797056885569954, + "ewc_loss": 
3.65253072232008e-09, + "grad_norm": 10.513062477111816, + "learning_rate": 2.562225475841874e-08, + "loss": 1.5246, + "mean_token_accuracy": 0.6142159104347229, + "num_tokens": 1850951.0, + "step": 71 + }, + { + "epoch": 0.007906874588183616, + "ewc_loss": 3.725290298461914e-09, + "grad_norm": 7.011465072631836, + "learning_rate": 2.5988286969253293e-08, + "loss": 1.5375, + "mean_token_accuracy": 0.6024121046066284, + "num_tokens": 1877690.0, + "step": 72 + }, + { + "epoch": 0.008016692290797276, + "ewc_loss": 3.841705620288849e-09, + "grad_norm": 6.916315078735352, + "learning_rate": 2.6354319180087848e-08, + "loss": 1.5773, + "mean_token_accuracy": 0.5913915634155273, + "num_tokens": 1904455.0, + "step": 73 + }, + { + "epoch": 0.008126509993410937, + "ewc_loss": 5.908077582716942e-09, + "grad_norm": 7.703507423400879, + "learning_rate": 2.67203513909224e-08, + "loss": 1.6072, + "mean_token_accuracy": 0.5873064994812012, + "num_tokens": 1928049.0, + "step": 74 + }, + { + "epoch": 0.008236327696024599, + "ewc_loss": 8.440110832452774e-09, + "grad_norm": 8.008210182189941, + "learning_rate": 2.7086383601756953e-08, + "loss": 1.4996, + "mean_token_accuracy": 0.6139552593231201, + "num_tokens": 1948422.0, + "step": 75 + }, + { + "epoch": 0.00834614539863826, + "ewc_loss": 1.0186340659856796e-08, + "grad_norm": 6.8755083084106445, + "learning_rate": 2.745241581259151e-08, + "loss": 1.4679, + "mean_token_accuracy": 0.6182062029838562, + "num_tokens": 1974329.0, + "step": 76 + }, + { + "epoch": 0.008455963101251922, + "ewc_loss": 1.1117663234472275e-08, + "grad_norm": 6.624422550201416, + "learning_rate": 2.781844802342606e-08, + "loss": 1.4229, + "mean_token_accuracy": 0.6306562423706055, + "num_tokens": 2000531.0, + "step": 77 + }, + { + "epoch": 0.008565780803865584, + "ewc_loss": 1.1874362826347351e-08, + "grad_norm": 6.9509453773498535, + "learning_rate": 2.8184480234260614e-08, + "loss": 1.4783, + "mean_token_accuracy": 0.6131646037101746, + "num_tokens": 
2023823.0, + "step": 78 + }, + { + "epoch": 0.008675598506479244, + "ewc_loss": 1.234002411365509e-08, + "grad_norm": 7.171600341796875, + "learning_rate": 2.855051244509517e-08, + "loss": 1.514, + "mean_token_accuracy": 0.6027036905288696, + "num_tokens": 2048734.0, + "step": 79 + }, + { + "epoch": 0.008785416209092905, + "ewc_loss": 1.2863893061876297e-08, + "grad_norm": 6.497890949249268, + "learning_rate": 2.8916544655929722e-08, + "loss": 1.4144, + "mean_token_accuracy": 0.6263729333877563, + "num_tokens": 2075741.0, + "step": 80 + }, + { + "epoch": 0.008895233911706567, + "ewc_loss": 1.3154931366443634e-08, + "grad_norm": 7.211454391479492, + "learning_rate": 2.9282576866764274e-08, + "loss": 1.5168, + "mean_token_accuracy": 0.6027291417121887, + "num_tokens": 2100247.0, + "step": 81 + }, + { + "epoch": 0.009005051614320228, + "ewc_loss": 1.3620592653751373e-08, + "grad_norm": 7.535137176513672, + "learning_rate": 2.964860907759883e-08, + "loss": 1.5992, + "mean_token_accuracy": 0.6030958294868469, + "num_tokens": 2122939.0, + "step": 82 + }, + { + "epoch": 0.00911486931693389, + "ewc_loss": 1.4086253941059113e-08, + "grad_norm": 8.628308296203613, + "learning_rate": 3.001464128843338e-08, + "loss": 1.4995, + "mean_token_accuracy": 0.6035601496696472, + "num_tokens": 2141424.0, + "step": 83 + }, + { + "epoch": 0.009224687019547552, + "ewc_loss": 1.4493707567453384e-08, + "grad_norm": 6.075673580169678, + "learning_rate": 3.038067349926794e-08, + "loss": 1.3978, + "mean_token_accuracy": 0.6278001070022583, + "num_tokens": 2171269.0, + "step": 84 + }, + { + "epoch": 0.009334504722161213, + "ewc_loss": 1.4726538211107254e-08, + "grad_norm": 6.592716217041016, + "learning_rate": 3.074670571010249e-08, + "loss": 1.3789, + "mean_token_accuracy": 0.6459140181541443, + "num_tokens": 2195892.0, + "step": 85 + }, + { + "epoch": 0.009444322424774873, + "ewc_loss": 1.501757651567459e-08, + "grad_norm": 5.923676013946533, + "learning_rate": 3.111273792093704e-08, + 
"loss": 1.4783, + "mean_token_accuracy": 0.6196292042732239, + "num_tokens": 2227992.0, + "step": 86 + }, + { + "epoch": 0.009554140127388535, + "ewc_loss": 1.501757651567459e-08, + "grad_norm": 6.360686302185059, + "learning_rate": 3.14787701317716e-08, + "loss": 1.535, + "mean_token_accuracy": 0.6106334924697876, + "num_tokens": 2256995.0, + "step": 87 + }, + { + "epoch": 0.009663957830002196, + "ewc_loss": 1.525040715932846e-08, + "grad_norm": 5.993893623352051, + "learning_rate": 3.184480234260615e-08, + "loss": 1.4486, + "mean_token_accuracy": 0.6219351291656494, + "num_tokens": 2286492.0, + "step": 88 + }, + { + "epoch": 0.009773775532615858, + "ewc_loss": 1.548323780298233e-08, + "grad_norm": 7.524285793304443, + "learning_rate": 3.22108345534407e-08, + "loss": 1.5042, + "mean_token_accuracy": 0.6077011823654175, + "num_tokens": 2309004.0, + "step": 89 + }, + { + "epoch": 0.00988359323522952, + "ewc_loss": 1.5599653124809265e-08, + "grad_norm": 6.127191066741943, + "learning_rate": 3.257686676427526e-08, + "loss": 1.4032, + "mean_token_accuracy": 0.6204712390899658, + "num_tokens": 2336480.0, + "step": 90 + }, + { + "epoch": 0.009993410937843181, + "ewc_loss": 1.6298145055770874e-08, + "grad_norm": 6.427892684936523, + "learning_rate": 3.294289897510981e-08, + "loss": 1.4478, + "mean_token_accuracy": 0.6148518323898315, + "num_tokens": 2363526.0, + "step": 91 + }, + { + "epoch": 0.010103228640456841, + "ewc_loss": 1.6763806343078613e-08, + "grad_norm": 6.051037788391113, + "learning_rate": 3.3308931185944364e-08, + "loss": 1.399, + "mean_token_accuracy": 0.6280444860458374, + "num_tokens": 2393847.0, + "step": 92 + }, + { + "epoch": 0.010213046343070502, + "ewc_loss": 1.7345882952213287e-08, + "grad_norm": 6.423926830291748, + "learning_rate": 3.367496339677891e-08, + "loss": 1.4982, + "mean_token_accuracy": 0.6060570478439331, + "num_tokens": 2421639.0, + "step": 93 + }, + { + "epoch": 0.010322864045684164, + "ewc_loss": 1.7695128917694092e-08, + 
"grad_norm": 6.7677388191223145, + "learning_rate": 3.404099560761347e-08, + "loss": 1.451, + "mean_token_accuracy": 0.6234097480773926, + "num_tokens": 2445896.0, + "step": 94 + }, + { + "epoch": 0.010432681748297826, + "ewc_loss": 1.8277205526828766e-08, + "grad_norm": 6.352190971374512, + "learning_rate": 3.4407027818448024e-08, + "loss": 1.5575, + "mean_token_accuracy": 0.6006790399551392, + "num_tokens": 2475814.0, + "step": 95 + }, + { + "epoch": 0.010542499450911487, + "ewc_loss": 1.8742866814136505e-08, + "grad_norm": 6.957694053649902, + "learning_rate": 3.477306002928257e-08, + "loss": 1.3838, + "mean_token_accuracy": 0.6394164562225342, + "num_tokens": 2499512.0, + "step": 96 + }, + { + "epoch": 0.010652317153525149, + "ewc_loss": 1.932494342327118e-08, + "grad_norm": 6.3415350914001465, + "learning_rate": 3.513909224011713e-08, + "loss": 1.5604, + "mean_token_accuracy": 0.5937451124191284, + "num_tokens": 2528412.0, + "step": 97 + }, + { + "epoch": 0.010762134856138809, + "ewc_loss": 1.9441358745098114e-08, + "grad_norm": 6.909445762634277, + "learning_rate": 3.5505124450951685e-08, + "loss": 1.5042, + "mean_token_accuracy": 0.6104564666748047, + "num_tokens": 2553855.0, + "step": 98 + }, + { + "epoch": 0.01087195255875247, + "ewc_loss": 2.0023435354232788e-08, + "grad_norm": 7.915687561035156, + "learning_rate": 3.5871156661786234e-08, + "loss": 1.5372, + "mean_token_accuracy": 0.6071147918701172, + "num_tokens": 2574996.0, + "step": 99 + }, + { + "epoch": 0.010981770261366132, + "ewc_loss": 2.0372681319713593e-08, + "grad_norm": 6.51836633682251, + "learning_rate": 3.623718887262079e-08, + "loss": 1.4905, + "mean_token_accuracy": 0.6126466393470764, + "num_tokens": 2601684.0, + "step": 100 + }, + { + "epoch": 0.011091587963979794, + "ewc_loss": 2.130400389432907e-08, + "grad_norm": 6.74946403503418, + "learning_rate": 3.6603221083455345e-08, + "loss": 1.5375, + "mean_token_accuracy": 0.6011444926261902, + "num_tokens": 2628612.0, + "step": 101 + }, + 
{ + "epoch": 0.011201405666593455, + "ewc_loss": 2.293381839990616e-08, + "grad_norm": 6.646988868713379, + "learning_rate": 3.6969253294289894e-08, + "loss": 1.4371, + "mean_token_accuracy": 0.6367395520210266, + "num_tokens": 2652776.0, + "step": 102 + }, + { + "epoch": 0.011311223369207117, + "ewc_loss": 2.6542693376541138e-08, + "grad_norm": 6.68102502822876, + "learning_rate": 3.733528550512445e-08, + "loss": 1.4051, + "mean_token_accuracy": 0.6313167810440063, + "num_tokens": 2677807.0, + "step": 103 + }, + { + "epoch": 0.011421041071820778, + "ewc_loss": 3.119930624961853e-08, + "grad_norm": 8.08180046081543, + "learning_rate": 3.7701317715959006e-08, + "loss": 1.5531, + "mean_token_accuracy": 0.6000133752822876, + "num_tokens": 2697989.0, + "step": 104 + }, + { + "epoch": 0.011530858774434438, + "ewc_loss": 3.725290298461914e-08, + "grad_norm": 6.713543891906738, + "learning_rate": 3.8067349926793555e-08, + "loss": 1.5395, + "mean_token_accuracy": 0.6026133298873901, + "num_tokens": 2722341.0, + "step": 105 + }, + { + "epoch": 0.0116406764770481, + "ewc_loss": 4.0745362639427185e-08, + "grad_norm": 5.837952136993408, + "learning_rate": 3.843338213762811e-08, + "loss": 1.5179, + "mean_token_accuracy": 0.602268636226654, + "num_tokens": 2753675.0, + "step": 106 + }, + { + "epoch": 0.011750494179661761, + "ewc_loss": 4.423782229423523e-08, + "grad_norm": 6.4248881340026855, + "learning_rate": 3.8799414348462666e-08, + "loss": 1.465, + "mean_token_accuracy": 0.613270103931427, + "num_tokens": 2778974.0, + "step": 107 + }, + { + "epoch": 0.011860311882275423, + "ewc_loss": 4.563480615615845e-08, + "grad_norm": 5.913054943084717, + "learning_rate": 3.9165446559297215e-08, + "loss": 1.5169, + "mean_token_accuracy": 0.602604329586029, + "num_tokens": 2808967.0, + "step": 108 + }, + { + "epoch": 0.011970129584889085, + "ewc_loss": 4.7031790018081665e-08, + "grad_norm": 6.289169788360596, + "learning_rate": 3.953147877013177e-08, + "loss": 1.5108, + 
"mean_token_accuracy": 0.6011351943016052, + "num_tokens": 2835393.0, + "step": 109 + }, + { + "epoch": 0.012079947287502746, + "ewc_loss": 4.7963112592697144e-08, + "grad_norm": 5.7157487869262695, + "learning_rate": 3.9897510980966327e-08, + "loss": 1.472, + "mean_token_accuracy": 0.6121635437011719, + "num_tokens": 2863678.0, + "step": 110 + }, + { + "epoch": 0.012189764990116406, + "ewc_loss": 4.842877388000488e-08, + "grad_norm": 6.161304950714111, + "learning_rate": 4.0263543191800876e-08, + "loss": 1.4444, + "mean_token_accuracy": 0.6229085922241211, + "num_tokens": 2890376.0, + "step": 111 + }, + { + "epoch": 0.012299582692730068, + "ewc_loss": 4.936009645462036e-08, + "grad_norm": 6.0506768226623535, + "learning_rate": 4.062957540263543e-08, + "loss": 1.4412, + "mean_token_accuracy": 0.6237902641296387, + "num_tokens": 2917128.0, + "step": 112 + }, + { + "epoch": 0.01240940039534373, + "ewc_loss": 4.959292709827423e-08, + "grad_norm": 6.60575532913208, + "learning_rate": 4.099560761346999e-08, + "loss": 1.4756, + "mean_token_accuracy": 0.6103695034980774, + "num_tokens": 2940812.0, + "step": 113 + }, + { + "epoch": 0.01251921809795739, + "ewc_loss": 5.029141902923584e-08, + "grad_norm": 5.929093837738037, + "learning_rate": 4.1361639824304536e-08, + "loss": 1.5022, + "mean_token_accuracy": 0.6018791198730469, + "num_tokens": 2969881.0, + "step": 114 + }, + { + "epoch": 0.012629035800571052, + "ewc_loss": 5.075708031654358e-08, + "grad_norm": 6.346651077270508, + "learning_rate": 4.172767203513909e-08, + "loss": 1.5013, + "mean_token_accuracy": 0.6086922883987427, + "num_tokens": 2996049.0, + "step": 115 + }, + { + "epoch": 0.012738853503184714, + "ewc_loss": 5.192123353481293e-08, + "grad_norm": 5.432767391204834, + "learning_rate": 4.209370424597365e-08, + "loss": 1.3714, + "mean_token_accuracy": 0.632562518119812, + "num_tokens": 3025829.0, + "step": 116 + }, + { + "epoch": 0.012848671205798374, + "ewc_loss": 5.21540641784668e-08, + "grad_norm": 
6.425621032714844, + "learning_rate": 4.2459736456808197e-08, + "loss": 1.4881, + "mean_token_accuracy": 0.6112613081932068, + "num_tokens": 3051012.0, + "step": 117 + }, + { + "epoch": 0.012958488908412035, + "ewc_loss": 5.2852556109428406e-08, + "grad_norm": 6.1243577003479, + "learning_rate": 4.282576866764275e-08, + "loss": 1.5295, + "mean_token_accuracy": 0.6048758625984192, + "num_tokens": 3077750.0, + "step": 118 + }, + { + "epoch": 0.013068306611025697, + "ewc_loss": 5.3085386753082275e-08, + "grad_norm": 6.3167805671691895, + "learning_rate": 4.319180087847731e-08, + "loss": 1.4793, + "mean_token_accuracy": 0.6090260744094849, + "num_tokens": 3103271.0, + "step": 119 + }, + { + "epoch": 0.013178124313639359, + "ewc_loss": 5.4016709327697754e-08, + "grad_norm": 7.032378196716309, + "learning_rate": 4.355783308931186e-08, + "loss": 1.5065, + "mean_token_accuracy": 0.6018191576004028, + "num_tokens": 3124511.0, + "step": 120 + }, + { + "epoch": 0.01328794201625302, + "ewc_loss": 5.471520125865936e-08, + "grad_norm": 6.118625640869141, + "learning_rate": 4.392386530014641e-08, + "loss": 1.4022, + "mean_token_accuracy": 0.6355272531509399, + "num_tokens": 3150451.0, + "step": 121 + }, + { + "epoch": 0.013397759718866682, + "ewc_loss": 5.587935447692871e-08, + "grad_norm": 5.80293083190918, + "learning_rate": 4.428989751098097e-08, + "loss": 1.5193, + "mean_token_accuracy": 0.6022583246231079, + "num_tokens": 3180151.0, + "step": 122 + }, + { + "epoch": 0.013507577421480343, + "ewc_loss": 5.704350769519806e-08, + "grad_norm": 6.032867431640625, + "learning_rate": 4.465592972181552e-08, + "loss": 1.513, + "mean_token_accuracy": 0.6060147285461426, + "num_tokens": 3207702.0, + "step": 123 + }, + { + "epoch": 0.013617395124094003, + "ewc_loss": 5.8906152844429016e-08, + "grad_norm": 7.666754722595215, + "learning_rate": 4.502196193265007e-08, + "loss": 1.3942, + "mean_token_accuracy": 0.6379348635673523, + "num_tokens": 3226168.0, + "step": 124 + }, + { + "epoch": 
0.013727212826707665, + "ewc_loss": 5.960464477539063e-08, + "grad_norm": 5.926448345184326, + "learning_rate": 4.538799414348463e-08, + "loss": 1.4774, + "mean_token_accuracy": 0.6078262329101562, + "num_tokens": 3255714.0, + "step": 125 + }, + { + "epoch": 0.013837030529321327, + "ewc_loss": 6.100162863731384e-08, + "grad_norm": 6.011061191558838, + "learning_rate": 4.575402635431918e-08, + "loss": 1.5153, + "mean_token_accuracy": 0.6085119843482971, + "num_tokens": 3283574.0, + "step": 126 + }, + { + "epoch": 0.013946848231934988, + "ewc_loss": 6.193295121192932e-08, + "grad_norm": 6.161673069000244, + "learning_rate": 4.6120058565153734e-08, + "loss": 1.4586, + "mean_token_accuracy": 0.6192654371261597, + "num_tokens": 3309128.0, + "step": 127 + }, + { + "epoch": 0.01405666593454865, + "ewc_loss": 6.332993507385254e-08, + "grad_norm": 6.983931064605713, + "learning_rate": 4.648609077598828e-08, + "loss": 1.5119, + "mean_token_accuracy": 0.6047614812850952, + "num_tokens": 3331577.0, + "step": 128 + }, + { + "epoch": 0.014166483637162311, + "ewc_loss": 6.51925802230835e-08, + "grad_norm": 6.175878524780273, + "learning_rate": 4.685212298682284e-08, + "loss": 1.567, + "mean_token_accuracy": 0.5891242623329163, + "num_tokens": 3358884.0, + "step": 129 + }, + { + "epoch": 0.014276301339775971, + "ewc_loss": 6.752088665962219e-08, + "grad_norm": 7.863307476043701, + "learning_rate": 4.7218155197657394e-08, + "loss": 1.5927, + "mean_token_accuracy": 0.5891718864440918, + "num_tokens": 3378122.0, + "step": 130 + }, + { + "epoch": 0.014386119042389633, + "ewc_loss": 6.938353180885315e-08, + "grad_norm": 6.468194961547852, + "learning_rate": 4.7584187408491943e-08, + "loss": 1.5471, + "mean_token_accuracy": 0.6015856266021729, + "num_tokens": 3404642.0, + "step": 131 + }, + { + "epoch": 0.014495936745003294, + "ewc_loss": 7.078051567077637e-08, + "grad_norm": 6.4611005783081055, + "learning_rate": 4.79502196193265e-08, + "loss": 1.4406, + "mean_token_accuracy": 
0.6150952577590942, + "num_tokens": 3429186.0, + "step": 132 + }, + { + "epoch": 0.014605754447616956, + "ewc_loss": 7.264316082000732e-08, + "grad_norm": 5.652223110198975, + "learning_rate": 4.8316251830161055e-08, + "loss": 1.4769, + "mean_token_accuracy": 0.6186650991439819, + "num_tokens": 3460810.0, + "step": 133 + }, + { + "epoch": 0.014715572150230618, + "ewc_loss": 7.35744833946228e-08, + "grad_norm": 5.564201831817627, + "learning_rate": 4.8682284040995604e-08, + "loss": 1.4437, + "mean_token_accuracy": 0.618206799030304, + "num_tokens": 3491285.0, + "step": 134 + }, + { + "epoch": 0.01482538985284428, + "ewc_loss": 7.450580596923828e-08, + "grad_norm": 6.009095191955566, + "learning_rate": 4.904831625183016e-08, + "loss": 1.4379, + "mean_token_accuracy": 0.6209105849266052, + "num_tokens": 3517893.0, + "step": 135 + }, + { + "epoch": 0.014935207555457939, + "ewc_loss": 7.59027898311615e-08, + "grad_norm": 6.102747917175293, + "learning_rate": 4.9414348462664715e-08, + "loss": 1.4893, + "mean_token_accuracy": 0.606765866279602, + "num_tokens": 3545614.0, + "step": 136 + }, + { + "epoch": 0.0150450252580716, + "ewc_loss": 7.729977369308472e-08, + "grad_norm": 6.5397772789001465, + "learning_rate": 4.9780380673499264e-08, + "loss": 1.5325, + "mean_token_accuracy": 0.5980149507522583, + "num_tokens": 3571156.0, + "step": 137 + }, + { + "epoch": 0.015154842960685262, + "ewc_loss": 8.102506399154663e-08, + "grad_norm": 6.500133514404297, + "learning_rate": 5.014641288433382e-08, + "loss": 1.4284, + "mean_token_accuracy": 0.6233725547790527, + "num_tokens": 3595596.0, + "step": 138 + }, + { + "epoch": 0.015264660663298924, + "ewc_loss": 8.475035429000854e-08, + "grad_norm": 5.691341876983643, + "learning_rate": 5.0512445095168376e-08, + "loss": 1.4415, + "mean_token_accuracy": 0.6331937313079834, + "num_tokens": 3625562.0, + "step": 139 + }, + { + "epoch": 0.015374478365912585, + "ewc_loss": 8.847564458847046e-08, + "grad_norm": 6.667827606201172, + 
"learning_rate": 5.0878477306002925e-08, + "loss": 1.4942, + "mean_token_accuracy": 0.6200010180473328, + "num_tokens": 3651717.0, + "step": 140 + }, + { + "epoch": 0.015484296068526247, + "ewc_loss": 9.359791874885559e-08, + "grad_norm": 6.767282009124756, + "learning_rate": 5.124450951683748e-08, + "loss": 1.3767, + "mean_token_accuracy": 0.6372277736663818, + "num_tokens": 3674773.0, + "step": 141 + }, + { + "epoch": 0.015594113771139909, + "ewc_loss": 1.0011717677116394e-07, + "grad_norm": 6.5189361572265625, + "learning_rate": 5.1610541727672036e-08, + "loss": 1.5606, + "mean_token_accuracy": 0.5894284248352051, + "num_tokens": 3701717.0, + "step": 142 + }, + { + "epoch": 0.01570393147375357, + "ewc_loss": 1.0710209608078003e-07, + "grad_norm": 5.676351547241211, + "learning_rate": 5.1976573938506585e-08, + "loss": 1.4076, + "mean_token_accuracy": 0.6261364817619324, + "num_tokens": 3729901.0, + "step": 143 + }, + { + "epoch": 0.015813749176367232, + "ewc_loss": 1.126900315284729e-07, + "grad_norm": 5.693291187286377, + "learning_rate": 5.234260614934114e-08, + "loss": 1.5075, + "mean_token_accuracy": 0.5947962999343872, + "num_tokens": 3757405.0, + "step": 144 + }, + { + "epoch": 0.01592356687898089, + "ewc_loss": 1.1920928955078125e-07, + "grad_norm": 5.3895649909973145, + "learning_rate": 5.2708638360175697e-08, + "loss": 1.3701, + "mean_token_accuracy": 0.6374711990356445, + "num_tokens": 3787289.0, + "step": 145 + }, + { + "epoch": 0.01603338458159455, + "ewc_loss": 1.2014061212539673e-07, + "grad_norm": 5.819759845733643, + "learning_rate": 5.3074670571010246e-08, + "loss": 1.4053, + "mean_token_accuracy": 0.6324830055236816, + "num_tokens": 3811919.0, + "step": 146 + }, + { + "epoch": 0.016143202284208215, + "ewc_loss": 1.2293457984924316e-07, + "grad_norm": 5.872399806976318, + "learning_rate": 5.34407027818448e-08, + "loss": 1.3469, + "mean_token_accuracy": 0.6399953961372375, + "num_tokens": 3835967.0, + "step": 147 + }, + { + "epoch": 
0.016253019986821875, + "ewc_loss": 1.2665987014770508e-07, + "grad_norm": 6.003317356109619, + "learning_rate": 5.380673499267936e-08, + "loss": 1.3926, + "mean_token_accuracy": 0.6275472640991211, + "num_tokens": 3859268.0, + "step": 148 + }, + { + "epoch": 0.016362837689435538, + "ewc_loss": 1.2945383787155151e-07, + "grad_norm": 4.868227005004883, + "learning_rate": 5.4172767203513906e-08, + "loss": 1.4436, + "mean_token_accuracy": 0.6142300963401794, + "num_tokens": 3892417.0, + "step": 149 + }, + { + "epoch": 0.016472655392049198, + "ewc_loss": 1.3131648302078247e-07, + "grad_norm": 6.536330223083496, + "learning_rate": 5.453879941434846e-08, + "loss": 1.3688, + "mean_token_accuracy": 0.6372878551483154, + "num_tokens": 3913323.0, + "step": 150 + }, + { + "epoch": 0.01658247309466286, + "ewc_loss": 1.341104507446289e-07, + "grad_norm": 4.784927845001221, + "learning_rate": 5.490483162518302e-08, + "loss": 1.264, + "mean_token_accuracy": 0.6556328535079956, + "num_tokens": 3942390.0, + "step": 151 + }, + { + "epoch": 0.01669229079727652, + "ewc_loss": 1.3597309589385986e-07, + "grad_norm": 5.567594051361084, + "learning_rate": 5.527086383601757e-08, + "loss": 1.4491, + "mean_token_accuracy": 0.613349199295044, + "num_tokens": 3968060.0, + "step": 152 + }, + { + "epoch": 0.01680210849989018, + "ewc_loss": 1.4156103134155273e-07, + "grad_norm": 4.817778587341309, + "learning_rate": 5.563689604685212e-08, + "loss": 1.3265, + "mean_token_accuracy": 0.6457346677780151, + "num_tokens": 3996149.0, + "step": 153 + }, + { + "epoch": 0.016911926202503844, + "ewc_loss": 1.4621764421463013e-07, + "grad_norm": 5.2074079513549805, + "learning_rate": 5.600292825768668e-08, + "loss": 1.4608, + "mean_token_accuracy": 0.6084411144256592, + "num_tokens": 4022708.0, + "step": 154 + }, + { + "epoch": 0.017021743905117504, + "ewc_loss": 1.4901161193847656e-07, + "grad_norm": 5.405880451202393, + "learning_rate": 5.636896046852123e-08, + "loss": 1.4956, + "mean_token_accuracy": 
0.6090396046638489, + "num_tokens": 4048984.0, + "step": 155 + }, + { + "epoch": 0.017131561607731167, + "ewc_loss": 1.5273690223693848e-07, + "grad_norm": 5.009867191314697, + "learning_rate": 5.673499267935578e-08, + "loss": 1.3828, + "mean_token_accuracy": 0.630152702331543, + "num_tokens": 4075654.0, + "step": 156 + }, + { + "epoch": 0.017241379310344827, + "ewc_loss": 1.5366822481155396e-07, + "grad_norm": 4.96783971786499, + "learning_rate": 5.710102489019034e-08, + "loss": 1.5388, + "mean_token_accuracy": 0.5872930288314819, + "num_tokens": 4106959.0, + "step": 157 + }, + { + "epoch": 0.017351197012958487, + "ewc_loss": 1.5459954738616943e-07, + "grad_norm": 4.923403739929199, + "learning_rate": 5.746705710102489e-08, + "loss": 1.428, + "mean_token_accuracy": 0.6252778172492981, + "num_tokens": 4135143.0, + "step": 158 + }, + { + "epoch": 0.01746101471557215, + "ewc_loss": 1.555308699607849e-07, + "grad_norm": 4.819852828979492, + "learning_rate": 5.7833089311859443e-08, + "loss": 1.3701, + "mean_token_accuracy": 0.6364824771881104, + "num_tokens": 4162641.0, + "step": 159 + }, + { + "epoch": 0.01757083241818581, + "ewc_loss": 1.5832483768463135e-07, + "grad_norm": 4.909451484680176, + "learning_rate": 5.8199121522694e-08, + "loss": 1.3281, + "mean_token_accuracy": 0.6355273723602295, + "num_tokens": 4189265.0, + "step": 160 + }, + { + "epoch": 0.017680650120799474, + "ewc_loss": 1.5925616025924683e-07, + "grad_norm": 5.743012428283691, + "learning_rate": 5.856515373352855e-08, + "loss": 1.495, + "mean_token_accuracy": 0.6049274206161499, + "num_tokens": 4209765.0, + "step": 161 + }, + { + "epoch": 0.017790467823413134, + "ewc_loss": 1.6111880540847778e-07, + "grad_norm": 5.270928382873535, + "learning_rate": 5.8931185944363104e-08, + "loss": 1.3899, + "mean_token_accuracy": 0.6261841654777527, + "num_tokens": 4232444.0, + "step": 162 + }, + { + "epoch": 0.017900285526026797, + "ewc_loss": 1.6298145055770874e-07, + "grad_norm": 4.39259147644043, + 
"learning_rate": 5.929721815519766e-08, + "loss": 1.3935, + "mean_token_accuracy": 0.6207857131958008, + "num_tokens": 4263314.0, + "step": 163 + }, + { + "epoch": 0.018010103228640457, + "ewc_loss": 1.6391277313232422e-07, + "grad_norm": 4.568431854248047, + "learning_rate": 5.966325036603222e-08, + "loss": 1.4023, + "mean_token_accuracy": 0.6264370083808899, + "num_tokens": 4292268.0, + "step": 164 + }, + { + "epoch": 0.018119920931254117, + "ewc_loss": 1.6577541828155518e-07, + "grad_norm": 4.976563930511475, + "learning_rate": 6.002928257686676e-08, + "loss": 1.4688, + "mean_token_accuracy": 0.6155873537063599, + "num_tokens": 4317038.0, + "step": 165 + }, + { + "epoch": 0.01822973863386778, + "ewc_loss": 1.6670674085617065e-07, + "grad_norm": 4.773156642913818, + "learning_rate": 6.039531478770131e-08, + "loss": 1.4727, + "mean_token_accuracy": 0.6053148508071899, + "num_tokens": 4344439.0, + "step": 166 + }, + { + "epoch": 0.01833955633648144, + "ewc_loss": 1.6763806343078613e-07, + "grad_norm": 4.70591926574707, + "learning_rate": 6.076134699853588e-08, + "loss": 1.3809, + "mean_token_accuracy": 0.6276069283485413, + "num_tokens": 4371828.0, + "step": 167 + }, + { + "epoch": 0.018449374039095103, + "ewc_loss": 1.695007085800171e-07, + "grad_norm": 5.240204811096191, + "learning_rate": 6.112737920937042e-08, + "loss": 1.3, + "mean_token_accuracy": 0.6411614418029785, + "num_tokens": 4393611.0, + "step": 168 + }, + { + "epoch": 0.018559191741708763, + "ewc_loss": 1.73225998878479e-07, + "grad_norm": 4.78212308883667, + "learning_rate": 6.149341142020497e-08, + "loss": 1.3804, + "mean_token_accuracy": 0.6270914077758789, + "num_tokens": 4419458.0, + "step": 169 + }, + { + "epoch": 0.018669009444322426, + "ewc_loss": 1.73225998878479e-07, + "grad_norm": 4.731996059417725, + "learning_rate": 6.185944363103954e-08, + "loss": 1.3339, + "mean_token_accuracy": 0.641382098197937, + "num_tokens": 4444110.0, + "step": 170 + }, + { + "epoch": 0.018778827146936086, + 
"ewc_loss": 1.7695128917694092e-07, + "grad_norm": 5.359416484832764, + "learning_rate": 6.222547584187409e-08, + "loss": 1.3703, + "mean_token_accuracy": 0.6262195110321045, + "num_tokens": 4465140.0, + "step": 171 + }, + { + "epoch": 0.018888644849549746, + "ewc_loss": 1.7881393432617188e-07, + "grad_norm": 4.878367900848389, + "learning_rate": 6.259150805270863e-08, + "loss": 1.3916, + "mean_token_accuracy": 0.6225416660308838, + "num_tokens": 4489605.0, + "step": 172 + }, + { + "epoch": 0.01899846255216341, + "ewc_loss": 1.8067657947540283e-07, + "grad_norm": 4.591178894042969, + "learning_rate": 6.29575402635432e-08, + "loss": 1.3396, + "mean_token_accuracy": 0.6355042457580566, + "num_tokens": 4515095.0, + "step": 173 + }, + { + "epoch": 0.01910828025477707, + "ewc_loss": 1.825392246246338e-07, + "grad_norm": 4.502574443817139, + "learning_rate": 6.332357247437773e-08, + "loss": 1.3736, + "mean_token_accuracy": 0.6240534782409668, + "num_tokens": 4541160.0, + "step": 174 + }, + { + "epoch": 0.019218097957390733, + "ewc_loss": 1.862645149230957e-07, + "grad_norm": 5.158288955688477, + "learning_rate": 6.36896046852123e-08, + "loss": 1.3612, + "mean_token_accuracy": 0.6272565126419067, + "num_tokens": 4563441.0, + "step": 175 + }, + { + "epoch": 0.019327915660004392, + "ewc_loss": 1.862645149230957e-07, + "grad_norm": 4.724072456359863, + "learning_rate": 6.405563689604684e-08, + "loss": 1.3093, + "mean_token_accuracy": 0.6393072605133057, + "num_tokens": 4586881.0, + "step": 176 + }, + { + "epoch": 0.019437733362618052, + "ewc_loss": 1.8812716007232666e-07, + "grad_norm": 4.32861852645874, + "learning_rate": 6.44216691068814e-08, + "loss": 1.3926, + "mean_token_accuracy": 0.6223559379577637, + "num_tokens": 4616811.0, + "step": 177 + }, + { + "epoch": 0.019547551065231716, + "ewc_loss": 1.8998980522155762e-07, + "grad_norm": 4.770681858062744, + "learning_rate": 6.478770131771596e-08, + "loss": 1.2345, + "mean_token_accuracy": 0.6580879092216492, + 
"num_tokens": 4639573.0, + "step": 178 + }, + { + "epoch": 0.019657368767845376, + "ewc_loss": 1.9371509552001953e-07, + "grad_norm": 4.6031575202941895, + "learning_rate": 6.515373352855052e-08, + "loss": 1.4218, + "mean_token_accuracy": 0.6183050274848938, + "num_tokens": 4666392.0, + "step": 179 + }, + { + "epoch": 0.01976718647045904, + "ewc_loss": 1.9837170839309692e-07, + "grad_norm": 4.773738861083984, + "learning_rate": 6.551976573938505e-08, + "loss": 1.3204, + "mean_token_accuracy": 0.6362520456314087, + "num_tokens": 4690288.0, + "step": 180 + }, + { + "epoch": 0.0198770041730727, + "ewc_loss": 2.0023435354232788e-07, + "grad_norm": 4.466789722442627, + "learning_rate": 6.588579795021962e-08, + "loss": 1.3599, + "mean_token_accuracy": 0.6247494220733643, + "num_tokens": 4716583.0, + "step": 181 + }, + { + "epoch": 0.019986821875686362, + "ewc_loss": 2.0116567611694336e-07, + "grad_norm": 4.734434127807617, + "learning_rate": 6.625183016105416e-08, + "loss": 1.2912, + "mean_token_accuracy": 0.6482353210449219, + "num_tokens": 4741315.0, + "step": 182 + }, + { + "epoch": 0.020096639578300022, + "ewc_loss": 2.0582228899002075e-07, + "grad_norm": 4.259112358093262, + "learning_rate": 6.661786237188873e-08, + "loss": 1.4531, + "mean_token_accuracy": 0.6122480630874634, + "num_tokens": 4771862.0, + "step": 183 + }, + { + "epoch": 0.020206457280913682, + "ewc_loss": 2.0954757928848267e-07, + "grad_norm": 4.366380214691162, + "learning_rate": 6.698389458272328e-08, + "loss": 1.4023, + "mean_token_accuracy": 0.6230812668800354, + "num_tokens": 4798609.0, + "step": 184 + }, + { + "epoch": 0.020316274983527345, + "ewc_loss": 2.1047890186309814e-07, + "grad_norm": 4.620734214782715, + "learning_rate": 6.734992679355783e-08, + "loss": 1.3438, + "mean_token_accuracy": 0.6308112740516663, + "num_tokens": 4821516.0, + "step": 185 + }, + { + "epoch": 0.020426092686141005, + "ewc_loss": 2.123415470123291e-07, + "grad_norm": 4.519257068634033, + "learning_rate": 
6.771595900439237e-08, + "loss": 1.3243, + "mean_token_accuracy": 0.6420289278030396, + "num_tokens": 4845487.0, + "step": 186 + }, + { + "epoch": 0.02053591038875467, + "ewc_loss": 2.1327286958694458e-07, + "grad_norm": 4.736703395843506, + "learning_rate": 6.808199121522694e-08, + "loss": 1.3907, + "mean_token_accuracy": 0.6292940974235535, + "num_tokens": 4868981.0, + "step": 187 + }, + { + "epoch": 0.020645728091368328, + "ewc_loss": 2.1327286958694458e-07, + "grad_norm": 4.307460308074951, + "learning_rate": 6.844802342606149e-08, + "loss": 1.3648, + "mean_token_accuracy": 0.6240247488021851, + "num_tokens": 4895095.0, + "step": 188 + }, + { + "epoch": 0.02075554579398199, + "ewc_loss": 2.384185791015625e-07, + "grad_norm": 7.856163024902344, + "learning_rate": 6.881405563689605e-08, + "loss": 1.429, + "mean_token_accuracy": 0.6133077144622803, + "num_tokens": 4920215.0, + "step": 189 + }, + { + "epoch": 0.02086536349659565, + "ewc_loss": 2.1792948246002197e-07, + "grad_norm": 4.027378082275391, + "learning_rate": 6.91800878477306e-08, + "loss": 1.3928, + "mean_token_accuracy": 0.6196175813674927, + "num_tokens": 4950113.0, + "step": 190 + }, + { + "epoch": 0.02097518119920931, + "ewc_loss": 2.207234501838684e-07, + "grad_norm": 5.055276870727539, + "learning_rate": 6.954612005856515e-08, + "loss": 1.3922, + "mean_token_accuracy": 0.6193819046020508, + "num_tokens": 4970901.0, + "step": 191 + }, + { + "epoch": 0.021084998901822975, + "ewc_loss": 2.2258609533309937e-07, + "grad_norm": 4.340865612030029, + "learning_rate": 6.99121522693997e-08, + "loss": 1.3307, + "mean_token_accuracy": 0.6374422907829285, + "num_tokens": 4996117.0, + "step": 192 + }, + { + "epoch": 0.021194816604436634, + "ewc_loss": 2.253800630569458e-07, + "grad_norm": 4.234145164489746, + "learning_rate": 7.027818448023426e-08, + "loss": 1.3658, + "mean_token_accuracy": 0.6265680193901062, + "num_tokens": 5020994.0, + "step": 193 + }, + { + "epoch": 0.021304634307050298, + "ewc_loss": 
2.2724270820617676e-07, + "grad_norm": 4.040282249450684, + "learning_rate": 7.064421669106881e-08, + "loss": 1.4234, + "mean_token_accuracy": 0.6184403896331787, + "num_tokens": 5053438.0, + "step": 194 + }, + { + "epoch": 0.021414452009663958, + "ewc_loss": 2.2910535335540771e-07, + "grad_norm": 4.627737998962402, + "learning_rate": 7.101024890190337e-08, + "loss": 1.4063, + "mean_token_accuracy": 0.6208810806274414, + "num_tokens": 5077854.0, + "step": 195 + }, + { + "epoch": 0.021524269712277617, + "ewc_loss": 2.3189932107925415e-07, + "grad_norm": 4.470465660095215, + "learning_rate": 7.137628111273792e-08, + "loss": 1.4023, + "mean_token_accuracy": 0.6136541962623596, + "num_tokens": 5103447.0, + "step": 196 + }, + { + "epoch": 0.02163408741489128, + "ewc_loss": 2.3469328880310059e-07, + "grad_norm": 4.450072288513184, + "learning_rate": 7.174231332357247e-08, + "loss": 1.4091, + "mean_token_accuracy": 0.614383339881897, + "num_tokens": 5129474.0, + "step": 197 + }, + { + "epoch": 0.02174390511750494, + "ewc_loss": 2.3655593395233154e-07, + "grad_norm": 4.681524276733398, + "learning_rate": 7.210834553440702e-08, + "loss": 1.3746, + "mean_token_accuracy": 0.6248411536216736, + "num_tokens": 5154331.0, + "step": 198 + }, + { + "epoch": 0.021853722820118604, + "ewc_loss": 2.384185791015625e-07, + "grad_norm": 3.902874708175659, + "learning_rate": 7.247437774524158e-08, + "loss": 1.3692, + "mean_token_accuracy": 0.6305184960365295, + "num_tokens": 5182997.0, + "step": 199 + }, + { + "epoch": 0.021963540522732264, + "ewc_loss": 2.4028122425079346e-07, + "grad_norm": 3.8041622638702393, + "learning_rate": 7.284040995607613e-08, + "loss": 1.3059, + "mean_token_accuracy": 0.663203239440918, + "num_tokens": 5209941.0, + "step": 200 + }, + { + "epoch": 0.022073358225345927, + "ewc_loss": 2.4028122425079346e-07, + "grad_norm": 4.284565448760986, + "learning_rate": 7.320644216691069e-08, + "loss": 1.2456, + "mean_token_accuracy": 0.6567074656486511, + "num_tokens": 
5235749.0, + "step": 201 + }, + { + "epoch": 0.022183175927959587, + "ewc_loss": 2.421438694000244e-07, + "grad_norm": 4.3639960289001465, + "learning_rate": 7.357247437774524e-08, + "loss": 1.3201, + "mean_token_accuracy": 0.6424267292022705, + "num_tokens": 5259439.0, + "step": 202 + }, + { + "epoch": 0.022292993630573247, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 4.544042587280273, + "learning_rate": 7.393850658857979e-08, + "loss": 1.3325, + "mean_token_accuracy": 0.6312687993049622, + "num_tokens": 5283571.0, + "step": 203 + }, + { + "epoch": 0.02240281133318691, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 4.503549098968506, + "learning_rate": 7.430453879941434e-08, + "loss": 1.264, + "mean_token_accuracy": 0.6514296531677246, + "num_tokens": 5306606.0, + "step": 204 + }, + { + "epoch": 0.02251262903580057, + "ewc_loss": 2.477318048477173e-07, + "grad_norm": 4.011395454406738, + "learning_rate": 7.46705710102489e-08, + "loss": 1.3617, + "mean_token_accuracy": 0.6394579410552979, + "num_tokens": 5333475.0, + "step": 205 + }, + { + "epoch": 0.022622446738414233, + "ewc_loss": 2.4959444999694824e-07, + "grad_norm": 4.019824504852295, + "learning_rate": 7.503660322108345e-08, + "loss": 1.409, + "mean_token_accuracy": 0.6150304079055786, + "num_tokens": 5360729.0, + "step": 206 + }, + { + "epoch": 0.022732264441027893, + "ewc_loss": 2.5331974029541016e-07, + "grad_norm": 4.793363094329834, + "learning_rate": 7.540263543191801e-08, + "loss": 1.2984, + "mean_token_accuracy": 0.6482268571853638, + "num_tokens": 5384083.0, + "step": 207 + }, + { + "epoch": 0.022842082143641557, + "ewc_loss": 2.5704503059387207e-07, + "grad_norm": 4.200797080993652, + "learning_rate": 7.576866764275256e-08, + "loss": 1.3166, + "mean_token_accuracy": 0.6360176205635071, + "num_tokens": 5407198.0, + "step": 208 + }, + { + "epoch": 0.022951899846255217, + "ewc_loss": 2.8870999813079834e-07, + "grad_norm": 7.623700141906738, + "learning_rate": 7.613469985358711e-08, + 
"loss": 1.3212, + "mean_token_accuracy": 0.6436549425125122, + "num_tokens": 5433236.0, + "step": 209 + }, + { + "epoch": 0.023061717548868876, + "ewc_loss": 2.644956111907959e-07, + "grad_norm": 4.554167747497559, + "learning_rate": 7.650073206442166e-08, + "loss": 1.382, + "mean_token_accuracy": 0.6287950873374939, + "num_tokens": 5457306.0, + "step": 210 + }, + { + "epoch": 0.02317153525148254, + "ewc_loss": 2.7008354663848877e-07, + "grad_norm": 4.318471431732178, + "learning_rate": 7.686676427525622e-08, + "loss": 1.3072, + "mean_token_accuracy": 0.641238808631897, + "num_tokens": 5477673.0, + "step": 211 + }, + { + "epoch": 0.0232813529540962, + "ewc_loss": 2.7567148208618164e-07, + "grad_norm": 4.027518272399902, + "learning_rate": 7.723279648609077e-08, + "loss": 1.3752, + "mean_token_accuracy": 0.6291112899780273, + "num_tokens": 5506648.0, + "step": 212 + }, + { + "epoch": 0.023391170656709863, + "ewc_loss": 2.7939677238464355e-07, + "grad_norm": 3.6263163089752197, + "learning_rate": 7.759882869692533e-08, + "loss": 1.4407, + "mean_token_accuracy": 0.6065710783004761, + "num_tokens": 5537209.0, + "step": 213 + }, + { + "epoch": 0.023500988359323523, + "ewc_loss": 2.7939677238464355e-07, + "grad_norm": 5.273519992828369, + "learning_rate": 7.796486090775987e-08, + "loss": 1.3883, + "mean_token_accuracy": 0.6326109766960144, + "num_tokens": 5559612.0, + "step": 214 + }, + { + "epoch": 0.023610806061937183, + "ewc_loss": 2.812594175338745e-07, + "grad_norm": 4.718125820159912, + "learning_rate": 7.833089311859443e-08, + "loss": 1.2989, + "mean_token_accuracy": 0.649314820766449, + "num_tokens": 5581853.0, + "step": 215 + }, + { + "epoch": 0.023720623764550846, + "ewc_loss": 2.8870999813079834e-07, + "grad_norm": 3.957597255706787, + "learning_rate": 7.869692532942898e-08, + "loss": 1.3115, + "mean_token_accuracy": 0.6419165730476379, + "num_tokens": 5605429.0, + "step": 216 + }, + { + "epoch": 0.023830441467164506, + "ewc_loss": 2.942979335784912e-07, + 
"grad_norm": 3.8119733333587646, + "learning_rate": 7.906295754026354e-08, + "loss": 1.2849, + "mean_token_accuracy": 0.6429544687271118, + "num_tokens": 5635630.0, + "step": 217 + }, + { + "epoch": 0.02394025916977817, + "ewc_loss": 2.9616057872772217e-07, + "grad_norm": 4.907812595367432, + "learning_rate": 7.942898975109809e-08, + "loss": 1.3939, + "mean_token_accuracy": 0.6189452409744263, + "num_tokens": 5661702.0, + "step": 218 + }, + { + "epoch": 0.02405007687239183, + "ewc_loss": 2.998858690261841e-07, + "grad_norm": 4.631181716918945, + "learning_rate": 7.979502196193265e-08, + "loss": 1.214, + "mean_token_accuracy": 0.6563711166381836, + "num_tokens": 5682274.0, + "step": 219 + }, + { + "epoch": 0.024159894575005492, + "ewc_loss": 3.03611159324646e-07, + "grad_norm": 3.547086477279663, + "learning_rate": 8.016105417276719e-08, + "loss": 1.3115, + "mean_token_accuracy": 0.6433394551277161, + "num_tokens": 5715819.0, + "step": 220 + }, + { + "epoch": 0.024269712277619152, + "ewc_loss": 3.0547380447387695e-07, + "grad_norm": 4.330469131469727, + "learning_rate": 8.052708638360175e-08, + "loss": 1.3454, + "mean_token_accuracy": 0.6319442987442017, + "num_tokens": 5740534.0, + "step": 221 + }, + { + "epoch": 0.024379529980232812, + "ewc_loss": 3.0919909477233887e-07, + "grad_norm": 6.164332389831543, + "learning_rate": 8.08931185944363e-08, + "loss": 1.3909, + "mean_token_accuracy": 0.6218575239181519, + "num_tokens": 5762357.0, + "step": 222 + }, + { + "epoch": 0.024489347682846475, + "ewc_loss": 3.129243850708008e-07, + "grad_norm": 5.290756702423096, + "learning_rate": 8.125915080527086e-08, + "loss": 1.2177, + "mean_token_accuracy": 0.6534779071807861, + "num_tokens": 5787211.0, + "step": 223 + }, + { + "epoch": 0.024599165385460135, + "ewc_loss": 3.166496753692627e-07, + "grad_norm": 4.1256866455078125, + "learning_rate": 8.162518301610541e-08, + "loss": 1.3448, + "mean_token_accuracy": 0.6399838328361511, + "num_tokens": 5820989.0, + "step": 224 + }, + { 
+ "epoch": 0.0247089830880738, + "ewc_loss": 3.241002559661865e-07, + "grad_norm": 5.22112512588501, + "learning_rate": 8.199121522693997e-08, + "loss": 1.2979, + "mean_token_accuracy": 0.6357036828994751, + "num_tokens": 5844903.0, + "step": 225 + }, + { + "epoch": 0.02481880079068746, + "ewc_loss": 3.203749656677246e-07, + "grad_norm": 5.260519981384277, + "learning_rate": 8.235724743777451e-08, + "loss": 1.2982, + "mean_token_accuracy": 0.6445165276527405, + "num_tokens": 5867365.0, + "step": 226 + }, + { + "epoch": 0.024928618493301122, + "ewc_loss": 3.2223761081695557e-07, + "grad_norm": 4.130448818206787, + "learning_rate": 8.272327964860907e-08, + "loss": 1.3746, + "mean_token_accuracy": 0.6221277713775635, + "num_tokens": 5895675.0, + "step": 227 + }, + { + "epoch": 0.02503843619591478, + "ewc_loss": 3.2782554626464844e-07, + "grad_norm": 4.34597635269165, + "learning_rate": 8.308931185944362e-08, + "loss": 1.3302, + "mean_token_accuracy": 0.6392707824707031, + "num_tokens": 5920138.0, + "step": 228 + }, + { + "epoch": 0.02514825389852844, + "ewc_loss": 3.427267074584961e-07, + "grad_norm": 5.976410388946533, + "learning_rate": 8.345534407027818e-08, + "loss": 1.3085, + "mean_token_accuracy": 0.6343300938606262, + "num_tokens": 5945763.0, + "step": 229 + }, + { + "epoch": 0.025258071601142105, + "ewc_loss": 3.296881914138794e-07, + "grad_norm": 5.081137180328369, + "learning_rate": 8.382137628111273e-08, + "loss": 1.2255, + "mean_token_accuracy": 0.662278413772583, + "num_tokens": 5971811.0, + "step": 230 + }, + { + "epoch": 0.025367889303755765, + "ewc_loss": 3.296881914138794e-07, + "grad_norm": 5.385824680328369, + "learning_rate": 8.41874084919473e-08, + "loss": 1.3604, + "mean_token_accuracy": 0.6327381730079651, + "num_tokens": 5995852.0, + "step": 231 + }, + { + "epoch": 0.025477707006369428, + "ewc_loss": 3.3155083656311035e-07, + "grad_norm": 5.299428462982178, + "learning_rate": 8.455344070278183e-08, + "loss": 1.2903, + "mean_token_accuracy": 
0.6424348950386047, + "num_tokens": 6018357.0, + "step": 232 + }, + { + "epoch": 0.025587524708983088, + "ewc_loss": 3.3527612686157227e-07, + "grad_norm": 4.244760990142822, + "learning_rate": 8.491947291361639e-08, + "loss": 1.3392, + "mean_token_accuracy": 0.6361217498779297, + "num_tokens": 6046503.0, + "step": 233 + }, + { + "epoch": 0.025697342411596748, + "ewc_loss": 3.3527612686157227e-07, + "grad_norm": 5.19500732421875, + "learning_rate": 8.528550512445094e-08, + "loss": 1.2712, + "mean_token_accuracy": 0.6457244753837585, + "num_tokens": 6067210.0, + "step": 234 + }, + { + "epoch": 0.02580716011421041, + "ewc_loss": 3.371387720108032e-07, + "grad_norm": 4.664117813110352, + "learning_rate": 8.56515373352855e-08, + "loss": 1.2485, + "mean_token_accuracy": 0.6505007743835449, + "num_tokens": 6091649.0, + "step": 235 + }, + { + "epoch": 0.02591697781682407, + "ewc_loss": 3.371387720108032e-07, + "grad_norm": 5.433875560760498, + "learning_rate": 8.601756954612005e-08, + "loss": 1.3089, + "mean_token_accuracy": 0.6433416604995728, + "num_tokens": 6119052.0, + "step": 236 + }, + { + "epoch": 0.026026795519437734, + "ewc_loss": 3.390014171600342e-07, + "grad_norm": 5.2787041664123535, + "learning_rate": 8.638360175695462e-08, + "loss": 1.2517, + "mean_token_accuracy": 0.65724778175354, + "num_tokens": 6141688.0, + "step": 237 + }, + { + "epoch": 0.026136613222051394, + "ewc_loss": 3.427267074584961e-07, + "grad_norm": 4.264070987701416, + "learning_rate": 8.674963396778915e-08, + "loss": 1.3312, + "mean_token_accuracy": 0.6298710107803345, + "num_tokens": 6168591.0, + "step": 238 + }, + { + "epoch": 0.026246430924665057, + "ewc_loss": 3.501772880554199e-07, + "grad_norm": 5.528504848480225, + "learning_rate": 8.711566617862371e-08, + "loss": 1.2045, + "mean_token_accuracy": 0.6594719886779785, + "num_tokens": 6186942.0, + "step": 239 + }, + { + "epoch": 0.026356248627278717, + "ewc_loss": 3.5762786865234375e-07, + "grad_norm": 4.858494281768799, + 
"learning_rate": 8.748169838945826e-08, + "loss": 1.2996, + "mean_token_accuracy": 0.6410405039787292, + "num_tokens": 6214324.0, + "step": 240 + }, + { + "epoch": 0.026466066329892377, + "ewc_loss": 3.5390257835388184e-07, + "grad_norm": 4.709834575653076, + "learning_rate": 8.784773060029283e-08, + "loss": 1.3172, + "mean_token_accuracy": 0.6369020342826843, + "num_tokens": 6240342.0, + "step": 241 + }, + { + "epoch": 0.02657588403250604, + "ewc_loss": 3.557652235031128e-07, + "grad_norm": 5.937821388244629, + "learning_rate": 8.821376281112737e-08, + "loss": 1.3686, + "mean_token_accuracy": 0.6253384351730347, + "num_tokens": 6266070.0, + "step": 242 + }, + { + "epoch": 0.0266857017351197, + "ewc_loss": 3.6135315895080566e-07, + "grad_norm": 4.688621520996094, + "learning_rate": 8.857979502196194e-08, + "loss": 1.3203, + "mean_token_accuracy": 0.6368655562400818, + "num_tokens": 6294087.0, + "step": 243 + }, + { + "epoch": 0.026795519437733364, + "ewc_loss": 3.632158041000366e-07, + "grad_norm": 4.985023021697998, + "learning_rate": 8.894582723279647e-08, + "loss": 1.2854, + "mean_token_accuracy": 0.6403250098228455, + "num_tokens": 6318429.0, + "step": 244 + }, + { + "epoch": 0.026905337140347024, + "ewc_loss": 3.6135315895080566e-07, + "grad_norm": 4.652887344360352, + "learning_rate": 8.931185944363104e-08, + "loss": 1.3172, + "mean_token_accuracy": 0.6410970687866211, + "num_tokens": 6348101.0, + "step": 245 + }, + { + "epoch": 0.027015154842960687, + "ewc_loss": 3.594905138015747e-07, + "grad_norm": 5.14209508895874, + "learning_rate": 8.967789165446558e-08, + "loss": 1.2921, + "mean_token_accuracy": 0.6468883752822876, + "num_tokens": 6373529.0, + "step": 246 + }, + { + "epoch": 0.027124972545574347, + "ewc_loss": 3.632158041000366e-07, + "grad_norm": 5.133951187133789, + "learning_rate": 9.004392386530015e-08, + "loss": 1.3998, + "mean_token_accuracy": 0.6162773370742798, + "num_tokens": 6400776.0, + "step": 247 + }, + { + "epoch": 0.027234790248188007, + 
"ewc_loss": 3.632158041000366e-07, + "grad_norm": 5.355627536773682, + "learning_rate": 9.04099560761347e-08, + "loss": 1.2663, + "mean_token_accuracy": 0.6407431960105896, + "num_tokens": 6425702.0, + "step": 248 + }, + { + "epoch": 0.02734460795080167, + "ewc_loss": 3.688037395477295e-07, + "grad_norm": 5.053912162780762, + "learning_rate": 9.077598828696926e-08, + "loss": 1.2714, + "mean_token_accuracy": 0.6495877504348755, + "num_tokens": 6447754.0, + "step": 249 + }, + { + "epoch": 0.02745442565341533, + "ewc_loss": 3.762543201446533e-07, + "grad_norm": 4.545516490936279, + "learning_rate": 9.11420204978038e-08, + "loss": 1.2162, + "mean_token_accuracy": 0.6680011749267578, + "num_tokens": 6477535.0, + "step": 250 + }, + { + "epoch": 0.027564243356028993, + "ewc_loss": 3.781169652938843e-07, + "grad_norm": 5.497277736663818, + "learning_rate": 9.150805270863836e-08, + "loss": 1.2577, + "mean_token_accuracy": 0.6526871919631958, + "num_tokens": 6504584.0, + "step": 251 + }, + { + "epoch": 0.027674061058642653, + "ewc_loss": 3.7997961044311523e-07, + "grad_norm": 5.595630645751953, + "learning_rate": 9.18740849194729e-08, + "loss": 1.1519, + "mean_token_accuracy": 0.679226815700531, + "num_tokens": 6529726.0, + "step": 252 + }, + { + "epoch": 0.027783878761256313, + "ewc_loss": 3.781169652938843e-07, + "grad_norm": 7.366680145263672, + "learning_rate": 9.224011713030747e-08, + "loss": 1.3506, + "mean_token_accuracy": 0.633664608001709, + "num_tokens": 6550265.0, + "step": 253 + }, + { + "epoch": 0.027893696463869976, + "ewc_loss": 3.7997961044311523e-07, + "grad_norm": 4.635837554931641, + "learning_rate": 9.260614934114202e-08, + "loss": 1.422, + "mean_token_accuracy": 0.6159522533416748, + "num_tokens": 6579733.0, + "step": 254 + }, + { + "epoch": 0.028003514166483636, + "ewc_loss": 3.818422555923462e-07, + "grad_norm": 4.2379326820373535, + "learning_rate": 9.297218155197657e-08, + "loss": 1.3105, + "mean_token_accuracy": 0.6411106586456299, + "num_tokens": 
6613781.0, + "step": 255 + }, + { + "epoch": 0.0281133318690973, + "ewc_loss": 3.781169652938843e-07, + "grad_norm": 5.529631614685059, + "learning_rate": 9.333821376281111e-08, + "loss": 1.2227, + "mean_token_accuracy": 0.6493027210235596, + "num_tokens": 6635982.0, + "step": 256 + }, + { + "epoch": 0.02822314957171096, + "ewc_loss": 3.7997961044311523e-07, + "grad_norm": 5.151161193847656, + "learning_rate": 9.370424597364568e-08, + "loss": 1.2878, + "mean_token_accuracy": 0.6368296146392822, + "num_tokens": 6660180.0, + "step": 257 + }, + { + "epoch": 0.028332967274324623, + "ewc_loss": 3.7997961044311523e-07, + "grad_norm": 5.496138095855713, + "learning_rate": 9.407027818448023e-08, + "loss": 1.2877, + "mean_token_accuracy": 0.6395713686943054, + "num_tokens": 6687798.0, + "step": 258 + }, + { + "epoch": 0.028442784976938282, + "ewc_loss": 3.8370490074157715e-07, + "grad_norm": 5.601419925689697, + "learning_rate": 9.443631039531479e-08, + "loss": 1.3125, + "mean_token_accuracy": 0.6410201787948608, + "num_tokens": 6716005.0, + "step": 259 + }, + { + "epoch": 0.028552602679551942, + "ewc_loss": 3.855675458908081e-07, + "grad_norm": 4.540834903717041, + "learning_rate": 9.480234260614934e-08, + "loss": 1.2921, + "mean_token_accuracy": 0.6424496173858643, + "num_tokens": 6744026.0, + "step": 260 + }, + { + "epoch": 0.028662420382165606, + "ewc_loss": 3.91155481338501e-07, + "grad_norm": 6.501614093780518, + "learning_rate": 9.516837481698389e-08, + "loss": 1.2615, + "mean_token_accuracy": 0.6433550119400024, + "num_tokens": 6765799.0, + "step": 261 + }, + { + "epoch": 0.028772238084779266, + "ewc_loss": 3.9674341678619385e-07, + "grad_norm": 5.347564220428467, + "learning_rate": 9.553440702781844e-08, + "loss": 1.3264, + "mean_token_accuracy": 0.6290442943572998, + "num_tokens": 6792636.0, + "step": 262 + }, + { + "epoch": 0.02888205578739293, + "ewc_loss": 4.2282044887542725e-07, + "grad_norm": 9.902229309082031, + "learning_rate": 9.5900439238653e-08, + 
"loss": 1.1282, + "mean_token_accuracy": 0.6817407608032227, + "num_tokens": 6814122.0, + "step": 263 + }, + { + "epoch": 0.02899187349000659, + "ewc_loss": 3.9674341678619385e-07, + "grad_norm": 5.310556411743164, + "learning_rate": 9.626647144948755e-08, + "loss": 1.3393, + "mean_token_accuracy": 0.6283395290374756, + "num_tokens": 6844162.0, + "step": 264 + }, + { + "epoch": 0.029101691192620252, + "ewc_loss": 3.986060619354248e-07, + "grad_norm": 6.092047214508057, + "learning_rate": 9.663250366032211e-08, + "loss": 1.1351, + "mean_token_accuracy": 0.6773399114608765, + "num_tokens": 6867856.0, + "step": 265 + }, + { + "epoch": 0.029211508895233912, + "ewc_loss": 4.0605664253234863e-07, + "grad_norm": 6.305900573730469, + "learning_rate": 9.699853587115666e-08, + "loss": 1.2363, + "mean_token_accuracy": 0.6626016497612, + "num_tokens": 6894300.0, + "step": 266 + }, + { + "epoch": 0.029321326597847572, + "ewc_loss": 4.0605664253234863e-07, + "grad_norm": 5.595137119293213, + "learning_rate": 9.736456808199121e-08, + "loss": 1.2512, + "mean_token_accuracy": 0.6507871150970459, + "num_tokens": 6919880.0, + "step": 267 + }, + { + "epoch": 0.029431144300461235, + "ewc_loss": 4.079192876815796e-07, + "grad_norm": 6.571917533874512, + "learning_rate": 9.773060029282576e-08, + "loss": 1.295, + "mean_token_accuracy": 0.6457264423370361, + "num_tokens": 6948610.0, + "step": 268 + }, + { + "epoch": 0.029540962003074895, + "ewc_loss": 4.153698682785034e-07, + "grad_norm": 5.103121757507324, + "learning_rate": 9.809663250366032e-08, + "loss": 1.2734, + "mean_token_accuracy": 0.6428097486495972, + "num_tokens": 6974058.0, + "step": 269 + }, + { + "epoch": 0.02965077970568856, + "ewc_loss": 4.1350722312927246e-07, + "grad_norm": 6.874606132507324, + "learning_rate": 9.846266471449487e-08, + "loss": 1.3268, + "mean_token_accuracy": 0.6337960362434387, + "num_tokens": 7001527.0, + "step": 270 + }, + { + "epoch": 0.029760597408302218, + "ewc_loss": 4.172325134277344e-07, + 
"grad_norm": 6.270367622375488, + "learning_rate": 9.882869692532943e-08, + "loss": 1.3023, + "mean_token_accuracy": 0.6330515146255493, + "num_tokens": 7025908.0, + "step": 271 + }, + { + "epoch": 0.029870415110915878, + "ewc_loss": 4.172325134277344e-07, + "grad_norm": 7.66644287109375, + "learning_rate": 9.919472913616398e-08, + "loss": 1.2006, + "mean_token_accuracy": 0.6575584411621094, + "num_tokens": 7048417.0, + "step": 272 + }, + { + "epoch": 0.02998023281352954, + "ewc_loss": 4.209578037261963e-07, + "grad_norm": 6.719223976135254, + "learning_rate": 9.956076134699853e-08, + "loss": 1.2363, + "mean_token_accuracy": 0.6523250341415405, + "num_tokens": 7072239.0, + "step": 273 + }, + { + "epoch": 0.0300900505161432, + "ewc_loss": 4.209578037261963e-07, + "grad_norm": 5.16672420501709, + "learning_rate": 9.992679355783308e-08, + "loss": 1.2991, + "mean_token_accuracy": 0.6390918493270874, + "num_tokens": 7097960.0, + "step": 274 + }, + { + "epoch": 0.030199868218756865, + "ewc_loss": 4.209578037261963e-07, + "grad_norm": 5.965039253234863, + "learning_rate": 1.0029282576866764e-07, + "loss": 1.2792, + "mean_token_accuracy": 0.6485739946365356, + "num_tokens": 7129347.0, + "step": 275 + }, + { + "epoch": 0.030309685921370524, + "ewc_loss": 4.2282044887542725e-07, + "grad_norm": 5.943881988525391, + "learning_rate": 1.0065885797950219e-07, + "loss": 1.2707, + "mean_token_accuracy": 0.6432873606681824, + "num_tokens": 7155811.0, + "step": 276 + }, + { + "epoch": 0.030419503623984188, + "ewc_loss": 4.284083843231201e-07, + "grad_norm": 6.362922668457031, + "learning_rate": 1.0102489019033675e-07, + "loss": 1.1627, + "mean_token_accuracy": 0.6727200746536255, + "num_tokens": 7175864.0, + "step": 277 + }, + { + "epoch": 0.030529321326597848, + "ewc_loss": 4.284083843231201e-07, + "grad_norm": 5.9555583000183105, + "learning_rate": 1.013909224011713e-07, + "loss": 1.192, + "mean_token_accuracy": 0.6594157218933105, + "num_tokens": 7197905.0, + "step": 278 + }, + { 
+ "epoch": 0.030639139029211507, + "ewc_loss": 4.284083843231201e-07, + "grad_norm": 5.879563808441162, + "learning_rate": 1.0175695461200585e-07, + "loss": 1.2778, + "mean_token_accuracy": 0.6454423069953918, + "num_tokens": 7223710.0, + "step": 279 + }, + { + "epoch": 0.03074895673182517, + "ewc_loss": 4.284083843231201e-07, + "grad_norm": 8.170295715332031, + "learning_rate": 1.021229868228404e-07, + "loss": 1.2321, + "mean_token_accuracy": 0.6535826921463013, + "num_tokens": 7245870.0, + "step": 280 + }, + { + "epoch": 0.03085877443443883, + "ewc_loss": 4.302710294723511e-07, + "grad_norm": 5.6173906326293945, + "learning_rate": 1.0248901903367496e-07, + "loss": 1.2697, + "mean_token_accuracy": 0.6463347673416138, + "num_tokens": 7271138.0, + "step": 281 + }, + { + "epoch": 0.030968592137052494, + "ewc_loss": 4.3213367462158203e-07, + "grad_norm": 5.415966987609863, + "learning_rate": 1.0285505124450951e-07, + "loss": 1.2869, + "mean_token_accuracy": 0.6486685276031494, + "num_tokens": 7297052.0, + "step": 282 + }, + { + "epoch": 0.031078409839666154, + "ewc_loss": 4.33996319770813e-07, + "grad_norm": 5.964801788330078, + "learning_rate": 1.0322108345534407e-07, + "loss": 1.3544, + "mean_token_accuracy": 0.6286612749099731, + "num_tokens": 7326733.0, + "step": 283 + }, + { + "epoch": 0.031188227542279817, + "ewc_loss": 4.377216100692749e-07, + "grad_norm": 6.092957019805908, + "learning_rate": 1.0358711566617862e-07, + "loss": 1.3009, + "mean_token_accuracy": 0.6364785432815552, + "num_tokens": 7358626.0, + "step": 284 + }, + { + "epoch": 0.03129804524489348, + "ewc_loss": 4.470348358154297e-07, + "grad_norm": 6.097567081451416, + "learning_rate": 1.0395314787701317e-07, + "loss": 1.3013, + "mean_token_accuracy": 0.6372898817062378, + "num_tokens": 7385058.0, + "step": 285 + }, + { + "epoch": 0.03140786294750714, + "ewc_loss": 4.470348358154297e-07, + "grad_norm": 4.687029838562012, + "learning_rate": 1.0431918008784772e-07, + "loss": 1.2568, + 
"mean_token_accuracy": 0.6519918441772461, + "num_tokens": 7414433.0, + "step": 286 + }, + { + "epoch": 0.0315176806501208, + "ewc_loss": 4.5262277126312256e-07, + "grad_norm": 4.509022235870361, + "learning_rate": 1.0468521229868228e-07, + "loss": 1.3723, + "mean_token_accuracy": 0.619979977607727, + "num_tokens": 7446196.0, + "step": 287 + }, + { + "epoch": 0.031627498352734464, + "ewc_loss": 4.507601261138916e-07, + "grad_norm": 8.311691284179688, + "learning_rate": 1.0505124450951683e-07, + "loss": 1.2323, + "mean_token_accuracy": 0.6480994820594788, + "num_tokens": 7469003.0, + "step": 288 + }, + { + "epoch": 0.03173731605534812, + "ewc_loss": 4.470348358154297e-07, + "grad_norm": 7.136562824249268, + "learning_rate": 1.0541727672035139e-07, + "loss": 1.2958, + "mean_token_accuracy": 0.6405853033065796, + "num_tokens": 7494794.0, + "step": 289 + }, + { + "epoch": 0.03184713375796178, + "ewc_loss": 4.544854164123535e-07, + "grad_norm": 4.994431018829346, + "learning_rate": 1.0578330893118593e-07, + "loss": 1.2929, + "mean_token_accuracy": 0.6383920907974243, + "num_tokens": 7531151.0, + "step": 290 + }, + { + "epoch": 0.03195695146057544, + "ewc_loss": 4.5821070671081543e-07, + "grad_norm": 5.512518405914307, + "learning_rate": 1.0614934114202049e-07, + "loss": 1.2003, + "mean_token_accuracy": 0.6601732969284058, + "num_tokens": 7556362.0, + "step": 291 + }, + { + "epoch": 0.0320667691631891, + "ewc_loss": 4.5821070671081543e-07, + "grad_norm": 5.110725402832031, + "learning_rate": 1.0651537335285504e-07, + "loss": 1.2884, + "mean_token_accuracy": 0.6379632353782654, + "num_tokens": 7580209.0, + "step": 292 + }, + { + "epoch": 0.03217658686580277, + "ewc_loss": 4.637986421585083e-07, + "grad_norm": 5.802891731262207, + "learning_rate": 1.068814055636896e-07, + "loss": 1.3453, + "mean_token_accuracy": 0.627960205078125, + "num_tokens": 7607823.0, + "step": 293 + }, + { + "epoch": 0.03228640456841643, + "ewc_loss": 4.637986421585083e-07, + "grad_norm": 
7.785724639892578, + "learning_rate": 1.0724743777452415e-07, + "loss": 1.3566, + "mean_token_accuracy": 0.6285929679870605, + "num_tokens": 7630083.0, + "step": 294 + }, + { + "epoch": 0.03239622227103009, + "ewc_loss": 4.731118679046631e-07, + "grad_norm": 4.477219581604004, + "learning_rate": 1.0761346998535871e-07, + "loss": 1.3449, + "mean_token_accuracy": 0.6320967674255371, + "num_tokens": 7657339.0, + "step": 295 + }, + { + "epoch": 0.03250603997364375, + "ewc_loss": 4.731118679046631e-07, + "grad_norm": 5.464284896850586, + "learning_rate": 1.0797950219619325e-07, + "loss": 1.1339, + "mean_token_accuracy": 0.6680145263671875, + "num_tokens": 7678789.0, + "step": 296 + }, + { + "epoch": 0.03261585767625741, + "ewc_loss": 4.731118679046631e-07, + "grad_norm": 5.749024868011475, + "learning_rate": 1.0834553440702781e-07, + "loss": 1.2146, + "mean_token_accuracy": 0.6632973551750183, + "num_tokens": 7705531.0, + "step": 297 + }, + { + "epoch": 0.032725675378871076, + "ewc_loss": 4.7497451305389404e-07, + "grad_norm": 5.809525012969971, + "learning_rate": 1.0871156661786236e-07, + "loss": 1.2377, + "mean_token_accuracy": 0.6515379548072815, + "num_tokens": 7726907.0, + "step": 298 + }, + { + "epoch": 0.032835493081484736, + "ewc_loss": 4.7497451305389404e-07, + "grad_norm": 5.119002819061279, + "learning_rate": 1.0907759882869692e-07, + "loss": 1.4048, + "mean_token_accuracy": 0.628668487071991, + "num_tokens": 7754917.0, + "step": 299 + }, + { + "epoch": 0.032945310784098396, + "ewc_loss": 4.7497451305389404e-07, + "grad_norm": 8.174702644348145, + "learning_rate": 1.0944363103953147e-07, + "loss": 1.259, + "mean_token_accuracy": 0.6429781317710876, + "num_tokens": 7780922.0, + "step": 300 + }, + { + "epoch": 0.033055128486712056, + "ewc_loss": 4.805624485015869e-07, + "grad_norm": 6.031988620758057, + "learning_rate": 1.0980966325036604e-07, + "loss": 1.2788, + "mean_token_accuracy": 0.6463908553123474, + "num_tokens": 7807004.0, + "step": 301 + }, + { + 
"epoch": 0.03316494618932572, + "ewc_loss": 4.805624485015869e-07, + "grad_norm": 7.054834365844727, + "learning_rate": 1.1017569546120057e-07, + "loss": 1.1587, + "mean_token_accuracy": 0.6666781306266785, + "num_tokens": 7830480.0, + "step": 302 + }, + { + "epoch": 0.03327476389193938, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 6.1473069190979, + "learning_rate": 1.1054172767203513e-07, + "loss": 1.2634, + "mean_token_accuracy": 0.6433677077293396, + "num_tokens": 7856537.0, + "step": 303 + }, + { + "epoch": 0.03338458159455304, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 7.349719524383545, + "learning_rate": 1.1090775988286968e-07, + "loss": 1.1919, + "mean_token_accuracy": 0.6625873446464539, + "num_tokens": 7875428.0, + "step": 304 + }, + { + "epoch": 0.0334943992971667, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 7.172534942626953, + "learning_rate": 1.1127379209370424e-07, + "loss": 1.2231, + "mean_token_accuracy": 0.6512633562088013, + "num_tokens": 7899765.0, + "step": 305 + }, + { + "epoch": 0.03360421699978036, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 5.867734432220459, + "learning_rate": 1.116398243045388e-07, + "loss": 1.2553, + "mean_token_accuracy": 0.6475018262863159, + "num_tokens": 7922002.0, + "step": 306 + }, + { + "epoch": 0.03371403470239403, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 6.806920051574707, + "learning_rate": 1.1200585651537336e-07, + "loss": 1.2632, + "mean_token_accuracy": 0.6510016918182373, + "num_tokens": 7945165.0, + "step": 307 + }, + { + "epoch": 0.03382385240500769, + "ewc_loss": 4.880130290985107e-07, + "grad_norm": 6.480072975158691, + "learning_rate": 1.1237188872620789e-07, + "loss": 1.1846, + "mean_token_accuracy": 0.6720231175422668, + "num_tokens": 7967060.0, + "step": 308 + }, + { + "epoch": 0.03393367010762135, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 5.317044258117676, + "learning_rate": 1.1273792093704245e-07, + "loss": 1.2047, + "mean_token_accuracy": 
0.6551101207733154, + "num_tokens": 7988205.0, + "step": 309 + }, + { + "epoch": 0.03404348781023501, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 5.064119815826416, + "learning_rate": 1.13103953147877e-07, + "loss": 1.1469, + "mean_token_accuracy": 0.6654024720191956, + "num_tokens": 8018516.0, + "step": 310 + }, + { + "epoch": 0.03415330551284867, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 5.4969024658203125, + "learning_rate": 1.1346998535871157e-07, + "loss": 1.2439, + "mean_token_accuracy": 0.6552759408950806, + "num_tokens": 8049713.0, + "step": 311 + }, + { + "epoch": 0.034263123215462335, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 5.229680061340332, + "learning_rate": 1.1383601756954611e-07, + "loss": 1.2434, + "mean_token_accuracy": 0.6534615755081177, + "num_tokens": 8077271.0, + "step": 312 + }, + { + "epoch": 0.034372940918075995, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 6.123237609863281, + "learning_rate": 1.1420204978038068e-07, + "loss": 1.2514, + "mean_token_accuracy": 0.6559536457061768, + "num_tokens": 8100334.0, + "step": 313 + }, + { + "epoch": 0.034482758620689655, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 5.210810661315918, + "learning_rate": 1.1456808199121521e-07, + "loss": 1.2702, + "mean_token_accuracy": 0.6622012853622437, + "num_tokens": 8127291.0, + "step": 314 + }, + { + "epoch": 0.034592576323303315, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 4.417454242706299, + "learning_rate": 1.1493411420204978e-07, + "loss": 1.303, + "mean_token_accuracy": 0.6382927894592285, + "num_tokens": 8158273.0, + "step": 315 + }, + { + "epoch": 0.034702394025916974, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 6.96243953704834, + "learning_rate": 1.1530014641288432e-07, + "loss": 1.1701, + "mean_token_accuracy": 0.6634287238121033, + "num_tokens": 8180056.0, + "step": 316 + }, + { + "epoch": 0.03481221172853064, + "ewc_loss": 4.917383193969727e-07, + "grad_norm": 5.785488128662109, + 
"learning_rate": 1.1566617862371889e-07, + "loss": 1.2445, + "mean_token_accuracy": 0.6542810201644897, + "num_tokens": 8203267.0, + "step": 317 + }, + { + "epoch": 0.0349220294311443, + "ewc_loss": 4.991888999938965e-07, + "grad_norm": 6.138689041137695, + "learning_rate": 1.1603221083455344e-07, + "loss": 1.2353, + "mean_token_accuracy": 0.6557822227478027, + "num_tokens": 8225551.0, + "step": 318 + }, + { + "epoch": 0.03503184713375796, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 6.987654685974121, + "learning_rate": 1.16398243045388e-07, + "loss": 1.2865, + "mean_token_accuracy": 0.6338117122650146, + "num_tokens": 8249591.0, + "step": 319 + }, + { + "epoch": 0.03514166483637162, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 6.15392541885376, + "learning_rate": 1.1676427525622253e-07, + "loss": 1.255, + "mean_token_accuracy": 0.6432145833969116, + "num_tokens": 8274565.0, + "step": 320 + }, + { + "epoch": 0.03525148253898529, + "ewc_loss": 5.066394805908203e-07, + "grad_norm": 7.588017463684082, + "learning_rate": 1.171303074670571e-07, + "loss": 1.2746, + "mean_token_accuracy": 0.6421947479248047, + "num_tokens": 8301284.0, + "step": 321 + }, + { + "epoch": 0.03536130024159895, + "ewc_loss": 5.140900611877441e-07, + "grad_norm": 7.425168037414551, + "learning_rate": 1.1749633967789165e-07, + "loss": 1.247, + "mean_token_accuracy": 0.6451322436332703, + "num_tokens": 8322113.0, + "step": 322 + }, + { + "epoch": 0.03547111794421261, + "ewc_loss": 5.140900611877441e-07, + "grad_norm": 5.608783721923828, + "learning_rate": 1.1786237188872621e-07, + "loss": 1.2811, + "mean_token_accuracy": 0.637179434299469, + "num_tokens": 8349315.0, + "step": 323 + }, + { + "epoch": 0.03558093564682627, + "ewc_loss": 5.140900611877441e-07, + "grad_norm": 6.23908805847168, + "learning_rate": 1.1822840409956076e-07, + "loss": 1.2406, + "mean_token_accuracy": 0.6481513977050781, + "num_tokens": 8369953.0, + "step": 324 + }, + { + "epoch": 0.03569075334943993, + 
"ewc_loss": 5.178153514862061e-07, + "grad_norm": 5.236105442047119, + "learning_rate": 1.1859443631039532e-07, + "loss": 1.2718, + "mean_token_accuracy": 0.6450678706169128, + "num_tokens": 8394310.0, + "step": 325 + }, + { + "epoch": 0.035800571052053594, + "ewc_loss": 5.178153514862061e-07, + "grad_norm": 7.5495524406433105, + "learning_rate": 1.1896046852122986e-07, + "loss": 1.3231, + "mean_token_accuracy": 0.6311029195785522, + "num_tokens": 8419318.0, + "step": 326 + }, + { + "epoch": 0.035910388754667254, + "ewc_loss": 5.289912223815918e-07, + "grad_norm": 7.786355495452881, + "learning_rate": 1.1932650073206443e-07, + "loss": 1.1032, + "mean_token_accuracy": 0.6789684295654297, + "num_tokens": 8443229.0, + "step": 327 + }, + { + "epoch": 0.036020206457280914, + "ewc_loss": 5.364418029785156e-07, + "grad_norm": 6.807726860046387, + "learning_rate": 1.1969253294289895e-07, + "loss": 1.2548, + "mean_token_accuracy": 0.6520034074783325, + "num_tokens": 8467929.0, + "step": 328 + }, + { + "epoch": 0.03613002415989457, + "ewc_loss": 5.289912223815918e-07, + "grad_norm": 7.45952033996582, + "learning_rate": 1.2005856515373353e-07, + "loss": 1.2184, + "mean_token_accuracy": 0.6555600762367249, + "num_tokens": 8491470.0, + "step": 329 + }, + { + "epoch": 0.03623984186250823, + "ewc_loss": 5.327165126800537e-07, + "grad_norm": 6.529544353485107, + "learning_rate": 1.2042459736456808e-07, + "loss": 1.1856, + "mean_token_accuracy": 0.6641278266906738, + "num_tokens": 8519715.0, + "step": 330 + }, + { + "epoch": 0.0363496595651219, + "ewc_loss": 5.364418029785156e-07, + "grad_norm": 8.304701805114746, + "learning_rate": 1.2079062957540263e-07, + "loss": 1.2453, + "mean_token_accuracy": 0.6475693583488464, + "num_tokens": 8542376.0, + "step": 331 + }, + { + "epoch": 0.03645947726773556, + "ewc_loss": 5.364418029785156e-07, + "grad_norm": 6.416407585144043, + "learning_rate": 1.2115666178623718e-07, + "loss": 1.2554, + "mean_token_accuracy": 0.6518071889877319, + 
"num_tokens": 8564513.0, + "step": 332 + }, + { + "epoch": 0.03656929497034922, + "ewc_loss": 5.364418029785156e-07, + "grad_norm": 5.606086254119873, + "learning_rate": 1.2152269399707175e-07, + "loss": 1.1767, + "mean_token_accuracy": 0.6683017611503601, + "num_tokens": 8587957.0, + "step": 333 + }, + { + "epoch": 0.03667911267296288, + "ewc_loss": 5.476176738739014e-07, + "grad_norm": 6.430941581726074, + "learning_rate": 1.2188872620790627e-07, + "loss": 1.1964, + "mean_token_accuracy": 0.6663935780525208, + "num_tokens": 8612155.0, + "step": 334 + }, + { + "epoch": 0.03678893037557654, + "ewc_loss": 5.513429641723633e-07, + "grad_norm": 6.6606855392456055, + "learning_rate": 1.2225475841874085e-07, + "loss": 1.2184, + "mean_token_accuracy": 0.6615048050880432, + "num_tokens": 8643677.0, + "step": 335 + }, + { + "epoch": 0.036898748078190206, + "ewc_loss": 5.513429641723633e-07, + "grad_norm": 6.01754093170166, + "learning_rate": 1.226207906295754e-07, + "loss": 1.1737, + "mean_token_accuracy": 0.6637842655181885, + "num_tokens": 8672599.0, + "step": 336 + }, + { + "epoch": 0.037008565780803866, + "ewc_loss": 5.513429641723633e-07, + "grad_norm": 4.625904083251953, + "learning_rate": 1.2298682284040995e-07, + "loss": 1.2656, + "mean_token_accuracy": 0.6554878950119019, + "num_tokens": 8696109.0, + "step": 337 + }, + { + "epoch": 0.037118383483417526, + "ewc_loss": 5.550682544708252e-07, + "grad_norm": 5.722536087036133, + "learning_rate": 1.233528550512445e-07, + "loss": 1.171, + "mean_token_accuracy": 0.6706241369247437, + "num_tokens": 8725617.0, + "step": 338 + }, + { + "epoch": 0.037228201186031186, + "ewc_loss": 5.513429641723633e-07, + "grad_norm": 4.233719348907471, + "learning_rate": 1.2371888726207907e-07, + "loss": 1.1814, + "mean_token_accuracy": 0.6652818918228149, + "num_tokens": 8753416.0, + "step": 339 + }, + { + "epoch": 0.03733801888864485, + "ewc_loss": 5.513429641723633e-07, + "grad_norm": 5.671319961547852, + "learning_rate": 
1.240849194729136e-07, + "loss": 1.2461, + "mean_token_accuracy": 0.6439917087554932, + "num_tokens": 8781610.0, + "step": 340 + }, + { + "epoch": 0.03744783659125851, + "ewc_loss": 5.550682544708252e-07, + "grad_norm": 5.2279534339904785, + "learning_rate": 1.2445095168374817e-07, + "loss": 1.1798, + "mean_token_accuracy": 0.6568683981895447, + "num_tokens": 8805386.0, + "step": 341 + }, + { + "epoch": 0.03755765429387217, + "ewc_loss": 5.550682544708252e-07, + "grad_norm": 6.322206974029541, + "learning_rate": 1.2481698389458272e-07, + "loss": 1.3183, + "mean_token_accuracy": 0.6364336013793945, + "num_tokens": 8830009.0, + "step": 342 + }, + { + "epoch": 0.03766747199648583, + "ewc_loss": 5.550682544708252e-07, + "grad_norm": 7.470382213592529, + "learning_rate": 1.2518301610541727e-07, + "loss": 1.1884, + "mean_token_accuracy": 0.659083902835846, + "num_tokens": 8848499.0, + "step": 343 + }, + { + "epoch": 0.03777728969909949, + "ewc_loss": 5.550682544708252e-07, + "grad_norm": 6.55460262298584, + "learning_rate": 1.2554904831625182e-07, + "loss": 1.1633, + "mean_token_accuracy": 0.6645952463150024, + "num_tokens": 8869974.0, + "step": 344 + }, + { + "epoch": 0.03788710740171316, + "ewc_loss": 5.62518835067749e-07, + "grad_norm": 4.7976226806640625, + "learning_rate": 1.259150805270864e-07, + "loss": 1.2864, + "mean_token_accuracy": 0.6378072500228882, + "num_tokens": 8897952.0, + "step": 345 + }, + { + "epoch": 0.03799692510432682, + "ewc_loss": 5.62518835067749e-07, + "grad_norm": 5.17179536819458, + "learning_rate": 1.2628111273792094e-07, + "loss": 1.2107, + "mean_token_accuracy": 0.6606115102767944, + "num_tokens": 8931394.0, + "step": 346 + }, + { + "epoch": 0.03810674280694048, + "ewc_loss": 5.662441253662109e-07, + "grad_norm": 5.084400177001953, + "learning_rate": 1.2664714494875547e-07, + "loss": 1.261, + "mean_token_accuracy": 0.6478112936019897, + "num_tokens": 8962484.0, + "step": 347 + }, + { + "epoch": 0.03821656050955414, + "ewc_loss": 
5.662441253662109e-07, + "grad_norm": 4.539783954620361, + "learning_rate": 1.2701317715959004e-07, + "loss": 1.2264, + "mean_token_accuracy": 0.6492651104927063, + "num_tokens": 8988810.0, + "step": 348 + }, + { + "epoch": 0.0383263782121678, + "ewc_loss": 5.736947059631348e-07, + "grad_norm": 7.465753078460693, + "learning_rate": 1.273792093704246e-07, + "loss": 1.1723, + "mean_token_accuracy": 0.6677128076553345, + "num_tokens": 9011086.0, + "step": 349 + }, + { + "epoch": 0.038436195914781465, + "ewc_loss": 5.736947059631348e-07, + "grad_norm": 5.490662097930908, + "learning_rate": 1.2774524158125917e-07, + "loss": 1.2475, + "mean_token_accuracy": 0.6506361365318298, + "num_tokens": 9038964.0, + "step": 350 + }, + { + "epoch": 0.038546013617395125, + "ewc_loss": 5.736947059631348e-07, + "grad_norm": 6.227256774902344, + "learning_rate": 1.281112737920937e-07, + "loss": 1.3167, + "mean_token_accuracy": 0.6302096247673035, + "num_tokens": 9065347.0, + "step": 351 + }, + { + "epoch": 0.038655831320008785, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 6.289591312408447, + "learning_rate": 1.2847730600292824e-07, + "loss": 1.1472, + "mean_token_accuracy": 0.675501823425293, + "num_tokens": 9089414.0, + "step": 352 + }, + { + "epoch": 0.038765649022622445, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 4.078950881958008, + "learning_rate": 1.288433382137628e-07, + "loss": 1.3124, + "mean_token_accuracy": 0.6360925436019897, + "num_tokens": 9120551.0, + "step": 353 + }, + { + "epoch": 0.038875466725236105, + "ewc_loss": 5.811452865600586e-07, + "grad_norm": 5.498574256896973, + "learning_rate": 1.2920937042459736e-07, + "loss": 1.2132, + "mean_token_accuracy": 0.6577107310295105, + "num_tokens": 9143936.0, + "step": 354 + }, + { + "epoch": 0.03898528442784977, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 5.470249652862549, + "learning_rate": 1.295754026354319e-07, + "loss": 1.2972, + "mean_token_accuracy": 0.6351332664489746, + "num_tokens": 
9171765.0, + "step": 355 + }, + { + "epoch": 0.03909510213046343, + "ewc_loss": 5.811452865600586e-07, + "grad_norm": 5.514886379241943, + "learning_rate": 1.2994143484626646e-07, + "loss": 1.2519, + "mean_token_accuracy": 0.6457765102386475, + "num_tokens": 9198275.0, + "step": 356 + }, + { + "epoch": 0.03920491983307709, + "ewc_loss": 5.811452865600586e-07, + "grad_norm": 7.742037773132324, + "learning_rate": 1.3030746705710104e-07, + "loss": 1.2033, + "mean_token_accuracy": 0.6511165499687195, + "num_tokens": 9221218.0, + "step": 357 + }, + { + "epoch": 0.03931473753569075, + "ewc_loss": 5.811452865600586e-07, + "grad_norm": 5.331280708312988, + "learning_rate": 1.3067349926793558e-07, + "loss": 1.2035, + "mean_token_accuracy": 0.6499122381210327, + "num_tokens": 9249309.0, + "step": 358 + }, + { + "epoch": 0.03942455523830442, + "ewc_loss": 5.811452865600586e-07, + "grad_norm": 6.6855669021606445, + "learning_rate": 1.310395314787701e-07, + "loss": 1.2339, + "mean_token_accuracy": 0.6494282484054565, + "num_tokens": 9270701.0, + "step": 359 + }, + { + "epoch": 0.03953437294091808, + "ewc_loss": 5.923211574554443e-07, + "grad_norm": 6.12860631942749, + "learning_rate": 1.3140556368960468e-07, + "loss": 1.2663, + "mean_token_accuracy": 0.647269606590271, + "num_tokens": 9303156.0, + "step": 360 + }, + { + "epoch": 0.03964419064353174, + "ewc_loss": 5.923211574554443e-07, + "grad_norm": 6.280877113342285, + "learning_rate": 1.3177159590043923e-07, + "loss": 1.1618, + "mean_token_accuracy": 0.6729719638824463, + "num_tokens": 9325163.0, + "step": 361 + }, + { + "epoch": 0.0397540083461454, + "ewc_loss": 5.923211574554443e-07, + "grad_norm": 7.352733135223389, + "learning_rate": 1.321376281112738e-07, + "loss": 1.1352, + "mean_token_accuracy": 0.6759496927261353, + "num_tokens": 9349474.0, + "step": 362 + }, + { + "epoch": 0.03986382604875906, + "ewc_loss": 6.07222318649292e-07, + "grad_norm": 5.330562591552734, + "learning_rate": 1.3250366032210833e-07, + "loss": 
1.1999, + "mean_token_accuracy": 0.6569935083389282, + "num_tokens": 9376227.0, + "step": 363 + }, + { + "epoch": 0.039973643751372724, + "ewc_loss": 6.109476089477539e-07, + "grad_norm": 6.0879058837890625, + "learning_rate": 1.3286969253294288e-07, + "loss": 1.1504, + "mean_token_accuracy": 0.668041467666626, + "num_tokens": 9398802.0, + "step": 364 + }, + { + "epoch": 0.040083461453986384, + "ewc_loss": 6.07222318649292e-07, + "grad_norm": 5.5991058349609375, + "learning_rate": 1.3323572474377745e-07, + "loss": 1.2323, + "mean_token_accuracy": 0.6523063778877258, + "num_tokens": 9425588.0, + "step": 365 + }, + { + "epoch": 0.040193279156600044, + "ewc_loss": 6.034970283508301e-07, + "grad_norm": 4.214214324951172, + "learning_rate": 1.33601756954612e-07, + "loss": 1.2832, + "mean_token_accuracy": 0.6323747038841248, + "num_tokens": 9457907.0, + "step": 366 + }, + { + "epoch": 0.040303096859213704, + "ewc_loss": 6.109476089477539e-07, + "grad_norm": 5.708691596984863, + "learning_rate": 1.3396778916544655e-07, + "loss": 1.1328, + "mean_token_accuracy": 0.6825323104858398, + "num_tokens": 9482988.0, + "step": 367 + }, + { + "epoch": 0.040412914561827364, + "ewc_loss": 6.109476089477539e-07, + "grad_norm": 6.666698932647705, + "learning_rate": 1.343338213762811e-07, + "loss": 1.2502, + "mean_token_accuracy": 0.6474542617797852, + "num_tokens": 9503553.0, + "step": 368 + }, + { + "epoch": 0.04052273226444103, + "ewc_loss": 6.109476089477539e-07, + "grad_norm": 4.941528797149658, + "learning_rate": 1.3469985358711565e-07, + "loss": 1.2488, + "mean_token_accuracy": 0.6496202349662781, + "num_tokens": 9528274.0, + "step": 369 + }, + { + "epoch": 0.04063254996705469, + "ewc_loss": 6.109476089477539e-07, + "grad_norm": 4.504311561584473, + "learning_rate": 1.3506588579795023e-07, + "loss": 1.21, + "mean_token_accuracy": 0.652092695236206, + "num_tokens": 9563887.0, + "step": 370 + }, + { + "epoch": 0.04074236766966835, + "ewc_loss": 6.183981895446777e-07, + "grad_norm": 
6.670443534851074, + "learning_rate": 1.3543191800878475e-07, + "loss": 1.2611, + "mean_token_accuracy": 0.6471705436706543, + "num_tokens": 9586556.0, + "step": 371 + }, + { + "epoch": 0.04085218537228201, + "ewc_loss": 6.183981895446777e-07, + "grad_norm": 5.843034744262695, + "learning_rate": 1.3579795021961932e-07, + "loss": 1.2113, + "mean_token_accuracy": 0.6564457416534424, + "num_tokens": 9616303.0, + "step": 372 + }, + { + "epoch": 0.04096200307489567, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 4.324850559234619, + "learning_rate": 1.3616398243045387e-07, + "loss": 1.2362, + "mean_token_accuracy": 0.6516453623771667, + "num_tokens": 9645693.0, + "step": 373 + }, + { + "epoch": 0.04107182077750934, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 6.787140369415283, + "learning_rate": 1.3653001464128845e-07, + "loss": 1.2298, + "mean_token_accuracy": 0.6564438343048096, + "num_tokens": 9672371.0, + "step": 374 + }, + { + "epoch": 0.041181638480122996, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 5.059487819671631, + "learning_rate": 1.3689604685212297e-07, + "loss": 1.1916, + "mean_token_accuracy": 0.6715582013130188, + "num_tokens": 9705264.0, + "step": 375 + }, + { + "epoch": 0.041291456182736656, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 5.318646430969238, + "learning_rate": 1.3726207906295752e-07, + "loss": 1.134, + "mean_token_accuracy": 0.6757746934890747, + "num_tokens": 9729726.0, + "step": 376 + }, + { + "epoch": 0.041401273885350316, + "ewc_loss": 6.295740604400635e-07, + "grad_norm": 6.093007564544678, + "learning_rate": 1.376281112737921e-07, + "loss": 1.2581, + "mean_token_accuracy": 0.6530345678329468, + "num_tokens": 9756079.0, + "step": 377 + }, + { + "epoch": 0.04151109158796398, + "ewc_loss": 6.370246410369873e-07, + "grad_norm": 5.596731185913086, + "learning_rate": 1.3799414348462665e-07, + "loss": 1.1157, + "mean_token_accuracy": 0.6789920330047607, + "num_tokens": 9778317.0, + "step": 378 + }, + { + "epoch": 
0.04162090929057764, + "ewc_loss": 6.407499313354492e-07, + "grad_norm": 4.819552421569824, + "learning_rate": 1.383601756954612e-07, + "loss": 1.2471, + "mean_token_accuracy": 0.6490230560302734, + "num_tokens": 9808114.0, + "step": 379 + }, + { + "epoch": 0.0417307269931913, + "ewc_loss": 6.407499313354492e-07, + "grad_norm": 4.335355758666992, + "learning_rate": 1.3872620790629574e-07, + "loss": 1.2268, + "mean_token_accuracy": 0.6429975032806396, + "num_tokens": 9833909.0, + "step": 380 + }, + { + "epoch": 0.04184054469580496, + "ewc_loss": 6.407499313354492e-07, + "grad_norm": 6.1398749351501465, + "learning_rate": 1.390922401171303e-07, + "loss": 1.1903, + "mean_token_accuracy": 0.6634188294410706, + "num_tokens": 9857091.0, + "step": 381 + }, + { + "epoch": 0.04195036239841862, + "ewc_loss": 6.444752216339111e-07, + "grad_norm": 5.585559844970703, + "learning_rate": 1.3945827232796487e-07, + "loss": 1.1552, + "mean_token_accuracy": 0.6659597754478455, + "num_tokens": 9880395.0, + "step": 382 + }, + { + "epoch": 0.04206018010103229, + "ewc_loss": 6.444752216339111e-07, + "grad_norm": 5.752450942993164, + "learning_rate": 1.398243045387994e-07, + "loss": 1.2777, + "mean_token_accuracy": 0.6408158540725708, + "num_tokens": 9904483.0, + "step": 383 + }, + { + "epoch": 0.04216999780364595, + "ewc_loss": 6.48200511932373e-07, + "grad_norm": 5.066919326782227, + "learning_rate": 1.4019033674963397e-07, + "loss": 1.1593, + "mean_token_accuracy": 0.6653457283973694, + "num_tokens": 9927865.0, + "step": 384 + }, + { + "epoch": 0.04227981550625961, + "ewc_loss": 6.48200511932373e-07, + "grad_norm": 6.398703575134277, + "learning_rate": 1.4055636896046852e-07, + "loss": 1.2409, + "mean_token_accuracy": 0.6551581025123596, + "num_tokens": 9956712.0, + "step": 385 + }, + { + "epoch": 0.04238963320887327, + "ewc_loss": 6.48200511932373e-07, + "grad_norm": 5.279623031616211, + "learning_rate": 1.409224011713031e-07, + "loss": 1.0897, + "mean_token_accuracy": 
0.6899148225784302, + "num_tokens": 9979222.0, + "step": 386 + }, + { + "epoch": 0.04249945091148693, + "ewc_loss": 6.51925802230835e-07, + "grad_norm": 5.923497200012207, + "learning_rate": 1.4128843338213761e-07, + "loss": 1.2239, + "mean_token_accuracy": 0.647872805595398, + "num_tokens": 10007509.0, + "step": 387 + }, + { + "epoch": 0.042609268614100596, + "ewc_loss": 6.51925802230835e-07, + "grad_norm": 4.264092922210693, + "learning_rate": 1.4165446559297216e-07, + "loss": 1.2586, + "mean_token_accuracy": 0.6484504342079163, + "num_tokens": 10037020.0, + "step": 388 + }, + { + "epoch": 0.042719086316714255, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 4.913336753845215, + "learning_rate": 1.4202049780380674e-07, + "loss": 1.2808, + "mean_token_accuracy": 0.6390488147735596, + "num_tokens": 10062076.0, + "step": 389 + }, + { + "epoch": 0.042828904019327915, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 4.395426273345947, + "learning_rate": 1.423865300146413e-07, + "loss": 1.1955, + "mean_token_accuracy": 0.658979058265686, + "num_tokens": 10093034.0, + "step": 390 + }, + { + "epoch": 0.042938721721941575, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 5.254908561706543, + "learning_rate": 1.4275256222547584e-07, + "loss": 1.2267, + "mean_token_accuracy": 0.6577738523483276, + "num_tokens": 10117118.0, + "step": 391 + }, + { + "epoch": 0.043048539424555235, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 4.677454471588135, + "learning_rate": 1.4311859443631039e-07, + "loss": 1.2686, + "mean_token_accuracy": 0.643732488155365, + "num_tokens": 10151018.0, + "step": 392 + }, + { + "epoch": 0.0431583571271689, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 6.739322662353516, + "learning_rate": 1.4348462664714493e-07, + "loss": 1.1113, + "mean_token_accuracy": 0.6781661510467529, + "num_tokens": 10168012.0, + "step": 393 + }, + { + "epoch": 0.04326817482978256, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 6.170870304107666, + 
"learning_rate": 1.438506588579795e-07, + "loss": 1.2434, + "mean_token_accuracy": 0.6457309126853943, + "num_tokens": 10193267.0, + "step": 394 + }, + { + "epoch": 0.04337799253239622, + "ewc_loss": 6.593763828277588e-07, + "grad_norm": 4.656559467315674, + "learning_rate": 1.4421669106881403e-07, + "loss": 1.1496, + "mean_token_accuracy": 0.662921130657196, + "num_tokens": 10219769.0, + "step": 395 + }, + { + "epoch": 0.04348781023500988, + "ewc_loss": 6.742775440216064e-07, + "grad_norm": 6.767523765563965, + "learning_rate": 1.445827232796486e-07, + "loss": 1.2554, + "mean_token_accuracy": 0.6438827514648438, + "num_tokens": 10244899.0, + "step": 396 + }, + { + "epoch": 0.04359762793762355, + "ewc_loss": 6.631016731262207e-07, + "grad_norm": 7.704606056213379, + "learning_rate": 1.4494875549048316e-07, + "loss": 1.1952, + "mean_token_accuracy": 0.6597439050674438, + "num_tokens": 10263877.0, + "step": 397 + }, + { + "epoch": 0.04370744564023721, + "ewc_loss": 6.631016731262207e-07, + "grad_norm": 5.89137077331543, + "learning_rate": 1.4531478770131773e-07, + "loss": 1.2539, + "mean_token_accuracy": 0.6452513337135315, + "num_tokens": 10287450.0, + "step": 398 + }, + { + "epoch": 0.04381726334285087, + "ewc_loss": 6.705522537231445e-07, + "grad_norm": 5.184228420257568, + "learning_rate": 1.4568081991215226e-07, + "loss": 1.2095, + "mean_token_accuracy": 0.6642740964889526, + "num_tokens": 10314939.0, + "step": 399 + }, + { + "epoch": 0.04392708104546453, + "ewc_loss": 6.780028343200684e-07, + "grad_norm": 5.087530136108398, + "learning_rate": 1.460468521229868e-07, + "loss": 1.1001, + "mean_token_accuracy": 0.6850038766860962, + "num_tokens": 10340075.0, + "step": 400 + }, + { + "epoch": 0.04403689874807819, + "ewc_loss": 6.742775440216064e-07, + "grad_norm": 6.37636137008667, + "learning_rate": 1.4641288433382138e-07, + "loss": 1.2529, + "mean_token_accuracy": 0.651249349117279, + "num_tokens": 10364116.0, + "step": 401 + }, + { + "epoch": 
0.044146716450691854, + "ewc_loss": 6.817281246185303e-07, + "grad_norm": 4.592787742614746, + "learning_rate": 1.4677891654465593e-07, + "loss": 1.2643, + "mean_token_accuracy": 0.6493604183197021, + "num_tokens": 10391839.0, + "step": 402 + }, + { + "epoch": 0.044256534153305514, + "ewc_loss": 6.891787052154541e-07, + "grad_norm": 4.611002445220947, + "learning_rate": 1.4714494875549048e-07, + "loss": 1.1995, + "mean_token_accuracy": 0.6717666387557983, + "num_tokens": 10419954.0, + "step": 403 + }, + { + "epoch": 0.044366351855919174, + "ewc_loss": 6.891787052154541e-07, + "grad_norm": 5.761373996734619, + "learning_rate": 1.4751098096632503e-07, + "loss": 1.2611, + "mean_token_accuracy": 0.6433578729629517, + "num_tokens": 10444143.0, + "step": 404 + }, + { + "epoch": 0.044476169558532834, + "ewc_loss": 6.92903995513916e-07, + "grad_norm": 5.356876850128174, + "learning_rate": 1.4787701317715958e-07, + "loss": 1.1941, + "mean_token_accuracy": 0.6595566868782043, + "num_tokens": 10473697.0, + "step": 405 + }, + { + "epoch": 0.044585987261146494, + "ewc_loss": 6.966292858123779e-07, + "grad_norm": 5.020455360412598, + "learning_rate": 1.4824304538799415e-07, + "loss": 1.1878, + "mean_token_accuracy": 0.6636152863502502, + "num_tokens": 10502612.0, + "step": 406 + }, + { + "epoch": 0.04469580496376016, + "ewc_loss": 6.966292858123779e-07, + "grad_norm": 5.039488792419434, + "learning_rate": 1.4860907759882867e-07, + "loss": 1.231, + "mean_token_accuracy": 0.6498821973800659, + "num_tokens": 10529266.0, + "step": 407 + }, + { + "epoch": 0.04480562266637382, + "ewc_loss": 7.003545761108398e-07, + "grad_norm": 6.211939334869385, + "learning_rate": 1.4897510980966325e-07, + "loss": 1.2113, + "mean_token_accuracy": 0.654626190662384, + "num_tokens": 10550616.0, + "step": 408 + }, + { + "epoch": 0.04491544036898748, + "ewc_loss": 7.040798664093018e-07, + "grad_norm": 4.93683385848999, + "learning_rate": 1.493411420204978e-07, + "loss": 1.1748, + "mean_token_accuracy": 
0.6646691560745239, + "num_tokens": 10582682.0, + "step": 409 + }, + { + "epoch": 0.04502525807160114, + "ewc_loss": 7.040798664093018e-07, + "grad_norm": 4.5524773597717285, + "learning_rate": 1.4970717423133235e-07, + "loss": 1.1045, + "mean_token_accuracy": 0.6798273921012878, + "num_tokens": 10606229.0, + "step": 410 + }, + { + "epoch": 0.0451350757742148, + "ewc_loss": 7.040798664093018e-07, + "grad_norm": 4.703146934509277, + "learning_rate": 1.500732064421669e-07, + "loss": 1.2519, + "mean_token_accuracy": 0.6446754932403564, + "num_tokens": 10639889.0, + "step": 411 + }, + { + "epoch": 0.04524489347682847, + "ewc_loss": 7.152557373046875e-07, + "grad_norm": 5.115393161773682, + "learning_rate": 1.5043923865300145e-07, + "loss": 1.2586, + "mean_token_accuracy": 0.6386842131614685, + "num_tokens": 10664352.0, + "step": 412 + }, + { + "epoch": 0.04535471117944213, + "ewc_loss": 7.227063179016113e-07, + "grad_norm": 4.437222003936768, + "learning_rate": 1.5080527086383602e-07, + "loss": 1.1676, + "mean_token_accuracy": 0.669095516204834, + "num_tokens": 10692255.0, + "step": 413 + }, + { + "epoch": 0.04546452888205579, + "ewc_loss": 7.301568984985352e-07, + "grad_norm": 6.5936665534973145, + "learning_rate": 1.5117130307467057e-07, + "loss": 1.1924, + "mean_token_accuracy": 0.6654262542724609, + "num_tokens": 10713360.0, + "step": 414 + }, + { + "epoch": 0.045574346584669446, + "ewc_loss": 7.301568984985352e-07, + "grad_norm": 5.26373291015625, + "learning_rate": 1.5153733528550512e-07, + "loss": 1.1788, + "mean_token_accuracy": 0.6659798622131348, + "num_tokens": 10743290.0, + "step": 415 + }, + { + "epoch": 0.04568416428728311, + "ewc_loss": 7.301568984985352e-07, + "grad_norm": 5.595615863800049, + "learning_rate": 1.5190336749633967e-07, + "loss": 1.1952, + "mean_token_accuracy": 0.6542649269104004, + "num_tokens": 10771567.0, + "step": 416 + }, + { + "epoch": 0.04579398198989677, + "ewc_loss": 7.301568984985352e-07, + "grad_norm": 4.246188163757324, + 
"learning_rate": 1.5226939970717422e-07, + "loss": 1.1574, + "mean_token_accuracy": 0.6812356114387512, + "num_tokens": 10796435.0, + "step": 417 + }, + { + "epoch": 0.04590379969251043, + "ewc_loss": 7.301568984985352e-07, + "grad_norm": 4.870153903961182, + "learning_rate": 1.526354319180088e-07, + "loss": 1.2214, + "mean_token_accuracy": 0.6628368496894836, + "num_tokens": 10826350.0, + "step": 418 + }, + { + "epoch": 0.04601361739512409, + "ewc_loss": 7.338821887969971e-07, + "grad_norm": 4.909542560577393, + "learning_rate": 1.5300146412884332e-07, + "loss": 1.1585, + "mean_token_accuracy": 0.6684927940368652, + "num_tokens": 10851407.0, + "step": 419 + }, + { + "epoch": 0.04612343509773775, + "ewc_loss": 7.37607479095459e-07, + "grad_norm": 5.005578994750977, + "learning_rate": 1.533674963396779e-07, + "loss": 1.3109, + "mean_token_accuracy": 0.6451378464698792, + "num_tokens": 10885508.0, + "step": 420 + }, + { + "epoch": 0.04623325280035142, + "ewc_loss": 7.37607479095459e-07, + "grad_norm": 4.731533050537109, + "learning_rate": 1.5373352855051244e-07, + "loss": 1.1925, + "mean_token_accuracy": 0.6547421813011169, + "num_tokens": 10911158.0, + "step": 421 + }, + { + "epoch": 0.04634307050296508, + "ewc_loss": 7.450580596923828e-07, + "grad_norm": 5.565402507781982, + "learning_rate": 1.54099560761347e-07, + "loss": 1.1279, + "mean_token_accuracy": 0.675905168056488, + "num_tokens": 10932147.0, + "step": 422 + }, + { + "epoch": 0.04645288820557874, + "ewc_loss": 7.487833499908447e-07, + "grad_norm": 5.384645462036133, + "learning_rate": 1.5446559297218154e-07, + "loss": 1.1805, + "mean_token_accuracy": 0.6633744835853577, + "num_tokens": 10952566.0, + "step": 423 + }, + { + "epoch": 0.0465627059081924, + "ewc_loss": 7.525086402893066e-07, + "grad_norm": 7.265563488006592, + "learning_rate": 1.548316251830161e-07, + "loss": 1.1503, + "mean_token_accuracy": 0.6699749231338501, + "num_tokens": 10971181.0, + "step": 424 + }, + { + "epoch": 0.04667252361080606, + 
"ewc_loss": 7.487833499908447e-07, + "grad_norm": 4.682273864746094, + "learning_rate": 1.5519765739385066e-07, + "loss": 1.1125, + "mean_token_accuracy": 0.6793301105499268, + "num_tokens": 10991562.0, + "step": 425 + }, + { + "epoch": 0.046782341313419726, + "ewc_loss": 7.525086402893066e-07, + "grad_norm": 4.352231979370117, + "learning_rate": 1.5556368960468521e-07, + "loss": 1.1868, + "mean_token_accuracy": 0.6591984629631042, + "num_tokens": 11016836.0, + "step": 426 + }, + { + "epoch": 0.046892159016033386, + "ewc_loss": 7.525086402893066e-07, + "grad_norm": 4.177327632904053, + "learning_rate": 1.5592972181551974e-07, + "loss": 1.23, + "mean_token_accuracy": 0.6502348184585571, + "num_tokens": 11040431.0, + "step": 427 + }, + { + "epoch": 0.047001976718647046, + "ewc_loss": 7.525086402893066e-07, + "grad_norm": 4.241857051849365, + "learning_rate": 1.562957540263543e-07, + "loss": 1.1768, + "mean_token_accuracy": 0.6648961901664734, + "num_tokens": 11065612.0, + "step": 428 + }, + { + "epoch": 0.047111794421260705, + "ewc_loss": 7.562339305877686e-07, + "grad_norm": 5.87527322769165, + "learning_rate": 1.5666178623718886e-07, + "loss": 1.1252, + "mean_token_accuracy": 0.6821317672729492, + "num_tokens": 11090027.0, + "step": 429 + }, + { + "epoch": 0.047221612123874365, + "ewc_loss": 7.599592208862305e-07, + "grad_norm": 3.949287176132202, + "learning_rate": 1.5702781844802344e-07, + "loss": 1.2907, + "mean_token_accuracy": 0.6342899799346924, + "num_tokens": 11121097.0, + "step": 430 + }, + { + "epoch": 0.04733142982648803, + "ewc_loss": 7.599592208862305e-07, + "grad_norm": 5.662450313568115, + "learning_rate": 1.5739385065885796e-07, + "loss": 1.2064, + "mean_token_accuracy": 0.6533792018890381, + "num_tokens": 11144282.0, + "step": 431 + }, + { + "epoch": 0.04744124752910169, + "ewc_loss": 7.711350917816162e-07, + "grad_norm": 5.335479736328125, + "learning_rate": 1.5775988286969253e-07, + "loss": 1.2086, + "mean_token_accuracy": 0.6602216958999634, + 
"num_tokens": 11172508.0, + "step": 432 + }, + { + "epoch": 0.04755106523171535, + "ewc_loss": 7.748603820800781e-07, + "grad_norm": 5.337142467498779, + "learning_rate": 1.5812591508052708e-07, + "loss": 1.159, + "mean_token_accuracy": 0.6713317632675171, + "num_tokens": 11194936.0, + "step": 433 + }, + { + "epoch": 0.04766088293432901, + "ewc_loss": 7.748603820800781e-07, + "grad_norm": 4.276597499847412, + "learning_rate": 1.5849194729136163e-07, + "loss": 1.0606, + "mean_token_accuracy": 0.6917989253997803, + "num_tokens": 11220873.0, + "step": 434 + }, + { + "epoch": 0.04777070063694268, + "ewc_loss": 7.7858567237854e-07, + "grad_norm": 4.386407375335693, + "learning_rate": 1.5885797950219618e-07, + "loss": 1.2029, + "mean_token_accuracy": 0.6559885740280151, + "num_tokens": 11245691.0, + "step": 435 + }, + { + "epoch": 0.04788051833955634, + "ewc_loss": 7.82310962677002e-07, + "grad_norm": 4.034515857696533, + "learning_rate": 1.5922401171303073e-07, + "loss": 1.2497, + "mean_token_accuracy": 0.6629741191864014, + "num_tokens": 11272386.0, + "step": 436 + }, + { + "epoch": 0.04799033604217, + "ewc_loss": 7.897615432739258e-07, + "grad_norm": 4.625980377197266, + "learning_rate": 1.595900439238653e-07, + "loss": 1.2561, + "mean_token_accuracy": 0.6478095650672913, + "num_tokens": 11301211.0, + "step": 437 + }, + { + "epoch": 0.04810015374478366, + "ewc_loss": 7.82310962677002e-07, + "grad_norm": 4.169581890106201, + "learning_rate": 1.5995607613469986e-07, + "loss": 1.2475, + "mean_token_accuracy": 0.6520504951477051, + "num_tokens": 11331253.0, + "step": 438 + }, + { + "epoch": 0.04820997144739732, + "ewc_loss": 7.82310962677002e-07, + "grad_norm": 3.8196463584899902, + "learning_rate": 1.6032210834553438e-07, + "loss": 1.1703, + "mean_token_accuracy": 0.6748536825180054, + "num_tokens": 11360900.0, + "step": 439 + }, + { + "epoch": 0.048319789150010985, + "ewc_loss": 7.82310962677002e-07, + "grad_norm": 4.340291500091553, + "learning_rate": 
1.6068814055636895e-07, + "loss": 1.2837, + "mean_token_accuracy": 0.6363709568977356, + "num_tokens": 11391599.0, + "step": 440 + }, + { + "epoch": 0.048429606852624645, + "ewc_loss": 7.82310962677002e-07, + "grad_norm": 3.8822553157806396, + "learning_rate": 1.610541727672035e-07, + "loss": 1.2085, + "mean_token_accuracy": 0.65887451171875, + "num_tokens": 11424019.0, + "step": 441 + }, + { + "epoch": 0.048539424555238304, + "ewc_loss": 7.860362529754639e-07, + "grad_norm": 4.3661394119262695, + "learning_rate": 1.6142020497803808e-07, + "loss": 1.1799, + "mean_token_accuracy": 0.6570247411727905, + "num_tokens": 11446707.0, + "step": 442 + }, + { + "epoch": 0.048649242257851964, + "ewc_loss": 7.860362529754639e-07, + "grad_norm": 4.690710544586182, + "learning_rate": 1.617862371888726e-07, + "loss": 1.1371, + "mean_token_accuracy": 0.6703434586524963, + "num_tokens": 11467310.0, + "step": 443 + }, + { + "epoch": 0.048759059960465624, + "ewc_loss": 7.934868335723877e-07, + "grad_norm": 3.834899425506592, + "learning_rate": 1.6215226939970718e-07, + "loss": 1.1899, + "mean_token_accuracy": 0.6602822542190552, + "num_tokens": 11493973.0, + "step": 444 + }, + { + "epoch": 0.04886887766307929, + "ewc_loss": 8.009374141693115e-07, + "grad_norm": 5.664656639099121, + "learning_rate": 1.6251830161054173e-07, + "loss": 1.1292, + "mean_token_accuracy": 0.6836904287338257, + "num_tokens": 11514611.0, + "step": 445 + }, + { + "epoch": 0.04897869536569295, + "ewc_loss": 8.121132850646973e-07, + "grad_norm": 4.6985249519348145, + "learning_rate": 1.6288433382137627e-07, + "loss": 1.2738, + "mean_token_accuracy": 0.646410346031189, + "num_tokens": 11538158.0, + "step": 446 + }, + { + "epoch": 0.04908851306830661, + "ewc_loss": 8.158385753631592e-07, + "grad_norm": 4.531350612640381, + "learning_rate": 1.6325036603221082e-07, + "loss": 1.1893, + "mean_token_accuracy": 0.6622310876846313, + "num_tokens": 11570295.0, + "step": 447 + }, + { + "epoch": 0.04919833077092027, + 
"ewc_loss": 8.083879947662354e-07, + "grad_norm": 4.898030757904053, + "learning_rate": 1.6361639824304537e-07, + "loss": 1.1591, + "mean_token_accuracy": 0.6707313060760498, + "num_tokens": 11597780.0, + "step": 448 + }, + { + "epoch": 0.04930814847353393, + "ewc_loss": 8.083879947662354e-07, + "grad_norm": 4.306581974029541, + "learning_rate": 1.6398243045387995e-07, + "loss": 1.215, + "mean_token_accuracy": 0.6609763503074646, + "num_tokens": 11623199.0, + "step": 449 + }, + { + "epoch": 0.0494179661761476, + "ewc_loss": 8.083879947662354e-07, + "grad_norm": 4.165746212005615, + "learning_rate": 1.643484626647145e-07, + "loss": 1.2509, + "mean_token_accuracy": 0.6400126814842224, + "num_tokens": 11651657.0, + "step": 450 + }, + { + "epoch": 0.04952778387876126, + "ewc_loss": 8.121132850646973e-07, + "grad_norm": 3.872346878051758, + "learning_rate": 1.6471449487554902e-07, + "loss": 1.1892, + "mean_token_accuracy": 0.6581152081489563, + "num_tokens": 11680381.0, + "step": 451 + }, + { + "epoch": 0.04963760158137492, + "ewc_loss": 8.158385753631592e-07, + "grad_norm": 4.026247501373291, + "learning_rate": 1.650805270863836e-07, + "loss": 1.147, + "mean_token_accuracy": 0.6714566946029663, + "num_tokens": 11705013.0, + "step": 452 + }, + { + "epoch": 0.04974741928398858, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 4.625300407409668, + "learning_rate": 1.6544655929721814e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6854815483093262, + "num_tokens": 11731140.0, + "step": 453 + }, + { + "epoch": 0.049857236986602244, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 4.038361072540283, + "learning_rate": 1.6581259150805272e-07, + "loss": 1.2976, + "mean_token_accuracy": 0.6348986625671387, + "num_tokens": 11761261.0, + "step": 454 + }, + { + "epoch": 0.0499670546892159, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 4.5958123207092285, + "learning_rate": 1.6617862371888724e-07, + "loss": 1.1686, + "mean_token_accuracy": 0.6738181114196777, + 
"num_tokens": 11787241.0, + "step": 455 + }, + { + "epoch": 0.05007687239182956, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 5.191490173339844, + "learning_rate": 1.6654465592972182e-07, + "loss": 1.203, + "mean_token_accuracy": 0.6601951718330383, + "num_tokens": 11808564.0, + "step": 456 + }, + { + "epoch": 0.05018669009444322, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 4.458142280578613, + "learning_rate": 1.6691068814055637e-07, + "loss": 1.1733, + "mean_token_accuracy": 0.6679567098617554, + "num_tokens": 11835170.0, + "step": 457 + }, + { + "epoch": 0.05029650779705688, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 4.262616157531738, + "learning_rate": 1.6727672035139092e-07, + "loss": 1.1754, + "mean_token_accuracy": 0.6681909561157227, + "num_tokens": 11862838.0, + "step": 458 + }, + { + "epoch": 0.05040632549967055, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 4.344353675842285, + "learning_rate": 1.6764275256222547e-07, + "loss": 1.3056, + "mean_token_accuracy": 0.6396126747131348, + "num_tokens": 11892865.0, + "step": 459 + }, + { + "epoch": 0.05051614320228421, + "ewc_loss": 8.307397365570068e-07, + "grad_norm": 4.851067543029785, + "learning_rate": 1.6800878477306001e-07, + "loss": 1.1215, + "mean_token_accuracy": 0.6804096698760986, + "num_tokens": 11913385.0, + "step": 460 + }, + { + "epoch": 0.05062596090489787, + "ewc_loss": 8.307397365570068e-07, + "grad_norm": 4.689693450927734, + "learning_rate": 1.683748169838946e-07, + "loss": 1.2183, + "mean_token_accuracy": 0.6533433198928833, + "num_tokens": 11942433.0, + "step": 461 + }, + { + "epoch": 0.05073577860751153, + "ewc_loss": 8.493661880493164e-07, + "grad_norm": 5.251265525817871, + "learning_rate": 1.6874084919472914e-07, + "loss": 1.191, + "mean_token_accuracy": 0.6570310592651367, + "num_tokens": 11968602.0, + "step": 462 + }, + { + "epoch": 0.05084559631012519, + "ewc_loss": 8.381903171539307e-07, + "grad_norm": 4.6558332443237305, + "learning_rate": 
1.6910688140556366e-07, + "loss": 1.1782, + "mean_token_accuracy": 0.6655294895172119, + "num_tokens": 11989652.0, + "step": 463 + }, + { + "epoch": 0.050955414012738856, + "ewc_loss": 8.381903171539307e-07, + "grad_norm": 4.3105549812316895, + "learning_rate": 1.6947291361639824e-07, + "loss": 1.2088, + "mean_token_accuracy": 0.6498585939407349, + "num_tokens": 12014428.0, + "step": 464 + }, + { + "epoch": 0.051065231715352516, + "ewc_loss": 8.419156074523926e-07, + "grad_norm": 4.221260070800781, + "learning_rate": 1.6983894582723279e-07, + "loss": 1.1093, + "mean_token_accuracy": 0.6776303052902222, + "num_tokens": 12041211.0, + "step": 465 + }, + { + "epoch": 0.051175049417966176, + "ewc_loss": 8.456408977508545e-07, + "grad_norm": 3.9981706142425537, + "learning_rate": 1.7020497803806736e-07, + "loss": 1.2389, + "mean_token_accuracy": 0.6595813035964966, + "num_tokens": 12071813.0, + "step": 466 + }, + { + "epoch": 0.051284867120579836, + "ewc_loss": 8.493661880493164e-07, + "grad_norm": 4.0353569984436035, + "learning_rate": 1.7057101024890188e-07, + "loss": 1.2398, + "mean_token_accuracy": 0.6498028039932251, + "num_tokens": 12099495.0, + "step": 467 + }, + { + "epoch": 0.051394684823193496, + "ewc_loss": 8.493661880493164e-07, + "grad_norm": 4.3166375160217285, + "learning_rate": 1.7093704245973643e-07, + "loss": 1.2378, + "mean_token_accuracy": 0.644794762134552, + "num_tokens": 12125207.0, + "step": 468 + }, + { + "epoch": 0.05150450252580716, + "ewc_loss": 8.493661880493164e-07, + "grad_norm": 5.451262474060059, + "learning_rate": 1.71303074670571e-07, + "loss": 1.1948, + "mean_token_accuracy": 0.6515969038009644, + "num_tokens": 12146398.0, + "step": 469 + }, + { + "epoch": 0.05161432022842082, + "ewc_loss": 8.493661880493164e-07, + "grad_norm": 4.661357402801514, + "learning_rate": 1.7166910688140556e-07, + "loss": 1.0944, + "mean_token_accuracy": 0.6797609925270081, + "num_tokens": 12170215.0, + "step": 470 + }, + { + "epoch": 0.05172413793103448, + 
"ewc_loss": 8.493661880493164e-07, + "grad_norm": 4.783364772796631, + "learning_rate": 1.720351390922401e-07, + "loss": 1.2461, + "mean_token_accuracy": 0.6442868709564209, + "num_tokens": 12192610.0, + "step": 471 + }, + { + "epoch": 0.05183395563364814, + "ewc_loss": 8.642673492431641e-07, + "grad_norm": 3.9238929748535156, + "learning_rate": 1.7240117130307466e-07, + "loss": 1.1903, + "mean_token_accuracy": 0.6650940775871277, + "num_tokens": 12220138.0, + "step": 472 + }, + { + "epoch": 0.05194377333626181, + "ewc_loss": 8.642673492431641e-07, + "grad_norm": 4.478322982788086, + "learning_rate": 1.7276720351390923e-07, + "loss": 1.1981, + "mean_token_accuracy": 0.6603453755378723, + "num_tokens": 12251326.0, + "step": 473 + }, + { + "epoch": 0.05205359103887547, + "ewc_loss": 8.717179298400879e-07, + "grad_norm": 4.084211349487305, + "learning_rate": 1.7313323572474378e-07, + "loss": 1.2564, + "mean_token_accuracy": 0.6463930606842041, + "num_tokens": 12280837.0, + "step": 474 + }, + { + "epoch": 0.05216340874148913, + "ewc_loss": 8.754432201385498e-07, + "grad_norm": 3.915600299835205, + "learning_rate": 1.734992679355783e-07, + "loss": 1.0777, + "mean_token_accuracy": 0.690098226070404, + "num_tokens": 12304158.0, + "step": 475 + }, + { + "epoch": 0.05227322644410279, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 4.390707492828369, + "learning_rate": 1.7386530014641288e-07, + "loss": 1.2531, + "mean_token_accuracy": 0.6427589654922485, + "num_tokens": 12331193.0, + "step": 476 + }, + { + "epoch": 0.05238304414671645, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 4.23602819442749, + "learning_rate": 1.7423133235724743e-07, + "loss": 1.2339, + "mean_token_accuracy": 0.6531134843826294, + "num_tokens": 12357251.0, + "step": 477 + }, + { + "epoch": 0.052492861849330115, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 3.935615301132202, + "learning_rate": 1.74597364568082e-07, + "loss": 1.2486, + "mean_token_accuracy": 0.6470775604248047, + 
"num_tokens": 12384863.0, + "step": 478 + }, + { + "epoch": 0.052602679551943775, + "ewc_loss": 8.791685104370117e-07, + "grad_norm": 3.6659560203552246, + "learning_rate": 1.7496339677891653e-07, + "loss": 1.1081, + "mean_token_accuracy": 0.6802610158920288, + "num_tokens": 12415065.0, + "step": 479 + }, + { + "epoch": 0.052712497254557435, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 4.337815284729004, + "learning_rate": 1.7532942898975108e-07, + "loss": 1.1217, + "mean_token_accuracy": 0.6752251386642456, + "num_tokens": 12439060.0, + "step": 480 + }, + { + "epoch": 0.052822314957171095, + "ewc_loss": 8.866190910339355e-07, + "grad_norm": 4.003812313079834, + "learning_rate": 1.7569546120058565e-07, + "loss": 1.13, + "mean_token_accuracy": 0.6702727675437927, + "num_tokens": 12462856.0, + "step": 481 + }, + { + "epoch": 0.052932132659784754, + "ewc_loss": 8.903443813323975e-07, + "grad_norm": 4.418869972229004, + "learning_rate": 1.760614934114202e-07, + "loss": 1.0971, + "mean_token_accuracy": 0.6798557043075562, + "num_tokens": 12487168.0, + "step": 482 + }, + { + "epoch": 0.05304195036239842, + "ewc_loss": 8.977949619293213e-07, + "grad_norm": 4.371679782867432, + "learning_rate": 1.7642752562225475e-07, + "loss": 1.2549, + "mean_token_accuracy": 0.642861008644104, + "num_tokens": 12516282.0, + "step": 483 + }, + { + "epoch": 0.05315176806501208, + "ewc_loss": 8.977949619293213e-07, + "grad_norm": 4.141193389892578, + "learning_rate": 1.767935578330893e-07, + "loss": 1.1145, + "mean_token_accuracy": 0.6788681745529175, + "num_tokens": 12541665.0, + "step": 484 + }, + { + "epoch": 0.05326158576762574, + "ewc_loss": 9.052455425262451e-07, + "grad_norm": 3.926374912261963, + "learning_rate": 1.7715959004392387e-07, + "loss": 1.2447, + "mean_token_accuracy": 0.6540967226028442, + "num_tokens": 12573589.0, + "step": 485 + }, + { + "epoch": 0.0533714034702394, + "ewc_loss": 9.126961231231689e-07, + "grad_norm": 3.7681241035461426, + "learning_rate": 
1.7752562225475842e-07, + "loss": 1.1391, + "mean_token_accuracy": 0.6720215082168579, + "num_tokens": 12601106.0, + "step": 486 + }, + { + "epoch": 0.05348122117285306, + "ewc_loss": 9.126961231231689e-07, + "grad_norm": 4.067622661590576, + "learning_rate": 1.7789165446559295e-07, + "loss": 1.1618, + "mean_token_accuracy": 0.6705989837646484, + "num_tokens": 12631770.0, + "step": 487 + }, + { + "epoch": 0.05359103887546673, + "ewc_loss": 9.164214134216309e-07, + "grad_norm": 4.174908638000488, + "learning_rate": 1.7825768667642752e-07, + "loss": 1.1462, + "mean_token_accuracy": 0.6618035435676575, + "num_tokens": 12655234.0, + "step": 488 + }, + { + "epoch": 0.05370085657808039, + "ewc_loss": 9.238719940185547e-07, + "grad_norm": 3.414440870285034, + "learning_rate": 1.7862371888726207e-07, + "loss": 1.1958, + "mean_token_accuracy": 0.6588654518127441, + "num_tokens": 12682179.0, + "step": 489 + }, + { + "epoch": 0.05381067428069405, + "ewc_loss": 9.275972843170166e-07, + "grad_norm": 3.851504325866699, + "learning_rate": 1.7898975109809665e-07, + "loss": 1.1571, + "mean_token_accuracy": 0.6671660542488098, + "num_tokens": 12711843.0, + "step": 490 + }, + { + "epoch": 0.05392049198330771, + "ewc_loss": 9.350478649139404e-07, + "grad_norm": 3.833345651626587, + "learning_rate": 1.7935578330893117e-07, + "loss": 1.1983, + "mean_token_accuracy": 0.6564195156097412, + "num_tokens": 12741567.0, + "step": 491 + }, + { + "epoch": 0.054030309685921374, + "ewc_loss": 9.350478649139404e-07, + "grad_norm": 4.098838806152344, + "learning_rate": 1.7972181551976572e-07, + "loss": 1.1982, + "mean_token_accuracy": 0.6616599559783936, + "num_tokens": 12767598.0, + "step": 492 + }, + { + "epoch": 0.054140127388535034, + "ewc_loss": 9.313225746154785e-07, + "grad_norm": 3.970040798187256, + "learning_rate": 1.800878477306003e-07, + "loss": 1.152, + "mean_token_accuracy": 0.6739300489425659, + "num_tokens": 12790026.0, + "step": 493 + }, + { + "epoch": 0.054249945091148694, + 
"ewc_loss": 9.350478649139404e-07, + "grad_norm": 3.347438097000122, + "learning_rate": 1.8045387994143484e-07, + "loss": 1.1573, + "mean_token_accuracy": 0.6723905801773071, + "num_tokens": 12818197.0, + "step": 494 + }, + { + "epoch": 0.05435976279376235, + "ewc_loss": 9.350478649139404e-07, + "grad_norm": 4.5827789306640625, + "learning_rate": 1.808199121522694e-07, + "loss": 1.1677, + "mean_token_accuracy": 0.6634361147880554, + "num_tokens": 12840299.0, + "step": 495 + }, + { + "epoch": 0.05446958049637601, + "ewc_loss": 9.350478649139404e-07, + "grad_norm": 4.548064708709717, + "learning_rate": 1.8118594436310394e-07, + "loss": 1.2437, + "mean_token_accuracy": 0.6457350850105286, + "num_tokens": 12865428.0, + "step": 496 + }, + { + "epoch": 0.05457939819898968, + "ewc_loss": 9.387731552124023e-07, + "grad_norm": 3.579904079437256, + "learning_rate": 1.8155197657393852e-07, + "loss": 1.1579, + "mean_token_accuracy": 0.674666166305542, + "num_tokens": 12892978.0, + "step": 497 + }, + { + "epoch": 0.05468921590160334, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 3.4856903553009033, + "learning_rate": 1.8191800878477307e-07, + "loss": 1.2035, + "mean_token_accuracy": 0.6677690744400024, + "num_tokens": 12921515.0, + "step": 498 + }, + { + "epoch": 0.054799033604217, + "ewc_loss": 9.462237358093262e-07, + "grad_norm": 4.005074977874756, + "learning_rate": 1.822840409956076e-07, + "loss": 1.1866, + "mean_token_accuracy": 0.663650631904602, + "num_tokens": 12947231.0, + "step": 499 + }, + { + "epoch": 0.05490885130683066, + "ewc_loss": 9.462237358093262e-07, + "grad_norm": 3.9455459117889404, + "learning_rate": 1.8265007320644216e-07, + "loss": 1.1964, + "mean_token_accuracy": 0.6570917963981628, + "num_tokens": 12975796.0, + "step": 500 + }, + { + "epoch": 0.05501866900944432, + "ewc_loss": 9.462237358093262e-07, + "grad_norm": 3.2674596309661865, + "learning_rate": 1.830161054172767e-07, + "loss": 1.1328, + "mean_token_accuracy": 0.6772549748420715, + 
"num_tokens": 13005740.0, + "step": 501 + }, + { + "epoch": 0.055128486712057986, + "ewc_loss": 9.462237358093262e-07, + "grad_norm": 3.9007833003997803, + "learning_rate": 1.833821376281113e-07, + "loss": 1.1669, + "mean_token_accuracy": 0.6591212749481201, + "num_tokens": 13029608.0, + "step": 502 + }, + { + "epoch": 0.055238304414671646, + "ewc_loss": 9.5367431640625e-07, + "grad_norm": 4.342975616455078, + "learning_rate": 1.837481698389458e-07, + "loss": 1.0764, + "mean_token_accuracy": 0.6907483339309692, + "num_tokens": 13050822.0, + "step": 503 + }, + { + "epoch": 0.055348122117285306, + "ewc_loss": 9.5367431640625e-07, + "grad_norm": 3.744807243347168, + "learning_rate": 1.8411420204978036e-07, + "loss": 1.0, + "mean_token_accuracy": 0.7032593488693237, + "num_tokens": 13076150.0, + "step": 504 + }, + { + "epoch": 0.055457939819898966, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 3.6997313499450684, + "learning_rate": 1.8448023426061494e-07, + "loss": 1.2183, + "mean_token_accuracy": 0.651647686958313, + "num_tokens": 13103581.0, + "step": 505 + }, + { + "epoch": 0.055567757522512626, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 3.825735569000244, + "learning_rate": 1.8484626647144948e-07, + "loss": 1.1506, + "mean_token_accuracy": 0.6628574728965759, + "num_tokens": 13131904.0, + "step": 506 + }, + { + "epoch": 0.05567757522512629, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 4.279286861419678, + "learning_rate": 1.8521229868228403e-07, + "loss": 1.0566, + "mean_token_accuracy": 0.6900294423103333, + "num_tokens": 13155934.0, + "step": 507 + }, + { + "epoch": 0.05578739292773995, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 3.8649964332580566, + "learning_rate": 1.8557833089311858e-07, + "loss": 1.113, + "mean_token_accuracy": 0.677303671836853, + "num_tokens": 13179650.0, + "step": 508 + }, + { + "epoch": 0.05589721063035361, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 3.905992269515991, + "learning_rate": 
1.8594436310395313e-07, + "loss": 1.2039, + "mean_token_accuracy": 0.6539415121078491, + "num_tokens": 13205174.0, + "step": 509 + }, + { + "epoch": 0.05600702833296727, + "ewc_loss": 9.685754776000977e-07, + "grad_norm": 3.7348108291625977, + "learning_rate": 1.863103953147877e-07, + "loss": 1.1462, + "mean_token_accuracy": 0.6668096780776978, + "num_tokens": 13231544.0, + "step": 510 + }, + { + "epoch": 0.05611684603558094, + "ewc_loss": 9.760260581970215e-07, + "grad_norm": 4.406200885772705, + "learning_rate": 1.8667642752562223e-07, + "loss": 1.1746, + "mean_token_accuracy": 0.6619477868080139, + "num_tokens": 13252845.0, + "step": 511 + }, + { + "epoch": 0.0562266637381946, + "ewc_loss": 9.760260581970215e-07, + "grad_norm": 3.6735775470733643, + "learning_rate": 1.870424597364568e-07, + "loss": 1.2696, + "mean_token_accuracy": 0.6381828784942627, + "num_tokens": 13285009.0, + "step": 512 + }, + { + "epoch": 0.05633648144080826, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 3.536172389984131, + "learning_rate": 1.8740849194729135e-07, + "loss": 1.0881, + "mean_token_accuracy": 0.6882994771003723, + "num_tokens": 13312518.0, + "step": 513 + }, + { + "epoch": 0.05644629914342192, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 3.635209083557129, + "learning_rate": 1.8777452415812593e-07, + "loss": 1.0884, + "mean_token_accuracy": 0.6887027621269226, + "num_tokens": 13335578.0, + "step": 514 + }, + { + "epoch": 0.05655611684603558, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 3.1390762329101562, + "learning_rate": 1.8814055636896045e-07, + "loss": 1.1944, + "mean_token_accuracy": 0.6663201451301575, + "num_tokens": 13368449.0, + "step": 515 + }, + { + "epoch": 0.056665934548649245, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 4.396877765655518, + "learning_rate": 1.88506588579795e-07, + "loss": 1.2245, + "mean_token_accuracy": 0.6540230512619019, + "num_tokens": 13397250.0, + "step": 516 + }, + { + "epoch": 0.056775752251262905, + 
"ewc_loss": 9.834766387939453e-07, + "grad_norm": 3.80806827545166, + "learning_rate": 1.8887262079062958e-07, + "loss": 1.1641, + "mean_token_accuracy": 0.6575118899345398, + "num_tokens": 13422857.0, + "step": 517 + }, + { + "epoch": 0.056885569953876565, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 3.742133378982544, + "learning_rate": 1.8923865300146413e-07, + "loss": 1.09, + "mean_token_accuracy": 0.6772218942642212, + "num_tokens": 13447180.0, + "step": 518 + }, + { + "epoch": 0.056995387656490225, + "ewc_loss": 9.909272193908691e-07, + "grad_norm": 4.339288234710693, + "learning_rate": 1.8960468521229868e-07, + "loss": 1.2421, + "mean_token_accuracy": 0.6395629644393921, + "num_tokens": 13469860.0, + "step": 519 + }, + { + "epoch": 0.057105205359103885, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.604748249053955, + "learning_rate": 1.8997071742313322e-07, + "loss": 1.1895, + "mean_token_accuracy": 0.6686891317367554, + "num_tokens": 13495606.0, + "step": 520 + }, + { + "epoch": 0.05721502306171755, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.3522562980651855, + "learning_rate": 1.9033674963396777e-07, + "loss": 1.1708, + "mean_token_accuracy": 0.6657407283782959, + "num_tokens": 13527790.0, + "step": 521 + }, + { + "epoch": 0.05732484076433121, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.5198616981506348, + "learning_rate": 1.9070278184480235e-07, + "loss": 1.1523, + "mean_token_accuracy": 0.6779863238334656, + "num_tokens": 13551986.0, + "step": 522 + }, + { + "epoch": 0.05743465846694487, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 4.186723709106445, + "learning_rate": 1.9106881405563687e-07, + "loss": 1.1995, + "mean_token_accuracy": 0.6567040681838989, + "num_tokens": 13577954.0, + "step": 523 + }, + { + "epoch": 0.05754447616955853, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.912282705307007, + "learning_rate": 1.9143484626647145e-07, + "loss": 1.1989, + "mean_token_accuracy": 0.6541672348976135, + 
"num_tokens": 13604870.0, + "step": 524 + }, + { + "epoch": 0.05765429387217219, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.3105549812316895, + "learning_rate": 1.91800878477306e-07, + "loss": 1.2149, + "mean_token_accuracy": 0.659574031829834, + "num_tokens": 13637200.0, + "step": 525 + }, + { + "epoch": 0.05776411157478586, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 5.679282188415527, + "learning_rate": 1.9216691068814057e-07, + "loss": 1.2078, + "mean_token_accuracy": 0.6498101949691772, + "num_tokens": 13656718.0, + "step": 526 + }, + { + "epoch": 0.05787392927739952, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 5.007328033447266, + "learning_rate": 1.925329428989751e-07, + "loss": 1.1224, + "mean_token_accuracy": 0.6746991276741028, + "num_tokens": 13680498.0, + "step": 527 + }, + { + "epoch": 0.05798374698001318, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 4.438077449798584, + "learning_rate": 1.9289897510980964e-07, + "loss": 1.0937, + "mean_token_accuracy": 0.6857426762580872, + "num_tokens": 13701339.0, + "step": 528 + }, + { + "epoch": 0.05809356468262684, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.9430174827575684, + "learning_rate": 1.9326500732064422e-07, + "loss": 1.1762, + "mean_token_accuracy": 0.6647309064865112, + "num_tokens": 13724927.0, + "step": 529 + }, + { + "epoch": 0.058203382385240504, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 4.304446697235107, + "learning_rate": 1.9363103953147877e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.68705153465271, + "num_tokens": 13745791.0, + "step": 530 + }, + { + "epoch": 0.058313200087854164, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.8369789123535156, + "learning_rate": 1.9399707174231332e-07, + "loss": 1.1006, + "mean_token_accuracy": 0.6787649393081665, + "num_tokens": 13770891.0, + "step": 531 + }, + { + "epoch": 0.058423017790467824, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 4.2916717529296875, + "learning_rate": 
1.9436310395314787e-07, + "loss": 1.126, + "mean_token_accuracy": 0.6673361659049988, + "num_tokens": 13790625.0, + "step": 532 + }, + { + "epoch": 0.058532835493081484, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.675513505935669, + "learning_rate": 1.9472913616398242e-07, + "loss": 1.0909, + "mean_token_accuracy": 0.6783087253570557, + "num_tokens": 13813346.0, + "step": 533 + }, + { + "epoch": 0.058642653195695144, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.5539438724517822, + "learning_rate": 1.95095168374817e-07, + "loss": 1.2052, + "mean_token_accuracy": 0.6556198596954346, + "num_tokens": 13837250.0, + "step": 534 + }, + { + "epoch": 0.05875247089830881, + "ewc_loss": 9.98377799987793e-07, + "grad_norm": 3.950100898742676, + "learning_rate": 1.9546120058565151e-07, + "loss": 1.1295, + "mean_token_accuracy": 0.6772207021713257, + "num_tokens": 13865730.0, + "step": 535 + }, + { + "epoch": 0.05886228860092247, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.8472182750701904, + "learning_rate": 1.958272327964861e-07, + "loss": 1.1169, + "mean_token_accuracy": 0.6747922897338867, + "num_tokens": 13893215.0, + "step": 536 + }, + { + "epoch": 0.05897210630353613, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.6001322269439697, + "learning_rate": 1.9619326500732064e-07, + "loss": 1.1667, + "mean_token_accuracy": 0.6646469831466675, + "num_tokens": 13919232.0, + "step": 537 + }, + { + "epoch": 0.05908192400614979, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.6976661682128906, + "learning_rate": 1.9655929721815521e-07, + "loss": 1.1742, + "mean_token_accuracy": 0.6633915901184082, + "num_tokens": 13943661.0, + "step": 538 + }, + { + "epoch": 0.05919174170876345, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 4.41510009765625, + "learning_rate": 1.9692532942898974e-07, + "loss": 1.1773, + "mean_token_accuracy": 0.6567957401275635, + "num_tokens": 13967209.0, + "step": 539 + }, + { + "epoch": 0.05930155941137712, + 
"ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.9282174110412598, + "learning_rate": 1.9729136163982429e-07, + "loss": 1.0409, + "mean_token_accuracy": 0.6947393417358398, + "num_tokens": 13987948.0, + "step": 540 + }, + { + "epoch": 0.059411377113990776, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.4744045734405518, + "learning_rate": 1.9765739385065886e-07, + "loss": 1.2061, + "mean_token_accuracy": 0.659429669380188, + "num_tokens": 14016862.0, + "step": 541 + }, + { + "epoch": 0.059521194816604436, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.598236083984375, + "learning_rate": 1.980234260614934e-07, + "loss": 1.1897, + "mean_token_accuracy": 0.6634911298751831, + "num_tokens": 14041242.0, + "step": 542 + }, + { + "epoch": 0.059631012519218096, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.551811456680298, + "learning_rate": 1.9838945827232796e-07, + "loss": 1.0917, + "mean_token_accuracy": 0.6849389672279358, + "num_tokens": 14064732.0, + "step": 543 + }, + { + "epoch": 0.059740830221831756, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.9719398021698, + "learning_rate": 1.987554904831625e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6857790946960449, + "num_tokens": 14090186.0, + "step": 544 + }, + { + "epoch": 0.05985064792444542, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.236711263656616, + "learning_rate": 1.9912152269399706e-07, + "loss": 1.2186, + "mean_token_accuracy": 0.654992401599884, + "num_tokens": 14121747.0, + "step": 545 + }, + { + "epoch": 0.05996046562705908, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.2377877235412598, + "learning_rate": 1.9948755490483163e-07, + "loss": 1.2292, + "mean_token_accuracy": 0.6499273777008057, + "num_tokens": 14152053.0, + "step": 546 + }, + { + "epoch": 0.06007028332967274, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 4.222299575805664, + "learning_rate": 1.9985358711566616e-07, + "loss": 1.1282, + "mean_token_accuracy": 
0.6797482371330261, + "num_tokens": 14170890.0, + "step": 547 + }, + { + "epoch": 0.0601801010322864, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.2519686222076416, + "learning_rate": 2.0021961932650073e-07, + "loss": 1.1177, + "mean_token_accuracy": 0.6764241456985474, + "num_tokens": 14200001.0, + "step": 548 + }, + { + "epoch": 0.06028991873490007, + "ewc_loss": 1.0132789611816406e-06, + "grad_norm": 3.2414660453796387, + "learning_rate": 2.0058565153733528e-07, + "loss": 1.1573, + "mean_token_accuracy": 0.669847846031189, + "num_tokens": 14225062.0, + "step": 549 + }, + { + "epoch": 0.06039973643751373, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 3.4869539737701416, + "learning_rate": 2.0095168374816983e-07, + "loss": 1.1287, + "mean_token_accuracy": 0.672218918800354, + "num_tokens": 14248169.0, + "step": 550 + }, + { + "epoch": 0.06050955414012739, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 3.6001882553100586, + "learning_rate": 2.0131771595900438e-07, + "loss": 1.1495, + "mean_token_accuracy": 0.6755500435829163, + "num_tokens": 14273309.0, + "step": 551 + }, + { + "epoch": 0.06061937184274105, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 3.4168365001678467, + "learning_rate": 2.0168374816983893e-07, + "loss": 1.1094, + "mean_token_accuracy": 0.6807097792625427, + "num_tokens": 14300494.0, + "step": 552 + }, + { + "epoch": 0.06072918954535471, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 3.1618592739105225, + "learning_rate": 2.020497803806735e-07, + "loss": 1.1441, + "mean_token_accuracy": 0.6644985675811768, + "num_tokens": 14327576.0, + "step": 553 + }, + { + "epoch": 0.060839007247968376, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.8082849979400635, + "learning_rate": 2.0241581259150805e-07, + "loss": 1.1678, + "mean_token_accuracy": 0.6604042053222656, + "num_tokens": 14359413.0, + "step": 554 + }, + { + "epoch": 0.060948824950582035, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 
3.6624813079833984, + "learning_rate": 2.027818448023426e-07, + "loss": 1.2867, + "mean_token_accuracy": 0.6489380598068237, + "num_tokens": 14389772.0, + "step": 555 + }, + { + "epoch": 0.061058642653195695, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 3.0789546966552734, + "learning_rate": 2.0314787701317715e-07, + "loss": 1.2015, + "mean_token_accuracy": 0.6610209941864014, + "num_tokens": 14416775.0, + "step": 556 + }, + { + "epoch": 0.061168460355809355, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 3.440401792526245, + "learning_rate": 2.035139092240117e-07, + "loss": 1.2417, + "mean_token_accuracy": 0.6431918144226074, + "num_tokens": 14445547.0, + "step": 557 + }, + { + "epoch": 0.061278278058423015, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 3.5007169246673584, + "learning_rate": 2.0387994143484627e-07, + "loss": 1.2135, + "mean_token_accuracy": 0.6501529216766357, + "num_tokens": 14469951.0, + "step": 558 + }, + { + "epoch": 0.06138809576103668, + "ewc_loss": 1.0505318641662598e-06, + "grad_norm": 3.427260160446167, + "learning_rate": 2.042459736456808e-07, + "loss": 1.1732, + "mean_token_accuracy": 0.6661263704299927, + "num_tokens": 14492322.0, + "step": 559 + }, + { + "epoch": 0.06149791346365034, + "ewc_loss": 1.0579824447631836e-06, + "grad_norm": 3.0197651386260986, + "learning_rate": 2.0461200585651537e-07, + "loss": 1.1639, + "mean_token_accuracy": 0.6671502590179443, + "num_tokens": 14517881.0, + "step": 560 + }, + { + "epoch": 0.061607731166264, + "ewc_loss": 1.0579824447631836e-06, + "grad_norm": 3.8457391262054443, + "learning_rate": 2.0497803806734992e-07, + "loss": 1.1944, + "mean_token_accuracy": 0.6646150350570679, + "num_tokens": 14543018.0, + "step": 561 + }, + { + "epoch": 0.06171754886887766, + "ewc_loss": 1.0579824447631836e-06, + "grad_norm": 3.789050579071045, + "learning_rate": 2.0534407027818447e-07, + "loss": 1.1742, + "mean_token_accuracy": 0.6596834063529968, + "num_tokens": 14570418.0, + "step": 562 + 
}, + { + "epoch": 0.06182736657149132, + "ewc_loss": 1.0728836059570312e-06, + "grad_norm": 4.072644233703613, + "learning_rate": 2.0571010248901902e-07, + "loss": 1.0311, + "mean_token_accuracy": 0.6953915357589722, + "num_tokens": 14589713.0, + "step": 563 + }, + { + "epoch": 0.06193718427410499, + "ewc_loss": 1.087784767150879e-06, + "grad_norm": 3.0773284435272217, + "learning_rate": 2.0607613469985357e-07, + "loss": 1.1743, + "mean_token_accuracy": 0.6618099212646484, + "num_tokens": 14617418.0, + "step": 564 + }, + { + "epoch": 0.06204700197671865, + "ewc_loss": 1.087784767150879e-06, + "grad_norm": 3.5030081272125244, + "learning_rate": 2.0644216691068814e-07, + "loss": 1.1459, + "mean_token_accuracy": 0.6753825545310974, + "num_tokens": 14645074.0, + "step": 565 + }, + { + "epoch": 0.06215681967933231, + "ewc_loss": 1.0952353477478027e-06, + "grad_norm": 3.2563090324401855, + "learning_rate": 2.068081991215227e-07, + "loss": 1.219, + "mean_token_accuracy": 0.6537748575210571, + "num_tokens": 14671552.0, + "step": 566 + }, + { + "epoch": 0.06226663738194597, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 3.3641271591186523, + "learning_rate": 2.0717423133235724e-07, + "loss": 1.1401, + "mean_token_accuracy": 0.6713603734970093, + "num_tokens": 14694223.0, + "step": 567 + }, + { + "epoch": 0.062376455084559634, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 3.706509590148926, + "learning_rate": 2.075402635431918e-07, + "loss": 1.0876, + "mean_token_accuracy": 0.6908804178237915, + "num_tokens": 14719184.0, + "step": 568 + }, + { + "epoch": 0.062486272787173294, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 3.2486581802368164, + "learning_rate": 2.0790629575402634e-07, + "loss": 1.1463, + "mean_token_accuracy": 0.672248125076294, + "num_tokens": 14742235.0, + "step": 569 + }, + { + "epoch": 0.06259609048978695, + "ewc_loss": 1.1101365089416504e-06, + "grad_norm": 3.2511820793151855, + "learning_rate": 2.0827232796486092e-07, + "loss": 
1.2242, + "mean_token_accuracy": 0.6526572704315186, + "num_tokens": 14769348.0, + "step": 570 + }, + { + "epoch": 0.06270590819240061, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 3.50819730758667, + "learning_rate": 2.0863836017569544e-07, + "loss": 1.0548, + "mean_token_accuracy": 0.6911004781723022, + "num_tokens": 14794555.0, + "step": 571 + }, + { + "epoch": 0.06281572589501427, + "ewc_loss": 1.1101365089416504e-06, + "grad_norm": 3.3586106300354004, + "learning_rate": 2.0900439238653001e-07, + "loss": 1.1447, + "mean_token_accuracy": 0.6694039106369019, + "num_tokens": 14817364.0, + "step": 572 + }, + { + "epoch": 0.06292554359762793, + "ewc_loss": 1.1026859283447266e-06, + "grad_norm": 3.8025412559509277, + "learning_rate": 2.0937042459736456e-07, + "loss": 1.1776, + "mean_token_accuracy": 0.6670847535133362, + "num_tokens": 14837472.0, + "step": 573 + }, + { + "epoch": 0.0630353613002416, + "ewc_loss": 1.1175870895385742e-06, + "grad_norm": 3.007599115371704, + "learning_rate": 2.097364568081991e-07, + "loss": 1.1198, + "mean_token_accuracy": 0.6778228282928467, + "num_tokens": 14865112.0, + "step": 574 + }, + { + "epoch": 0.06314517900285525, + "ewc_loss": 1.1101365089416504e-06, + "grad_norm": 3.0867373943328857, + "learning_rate": 2.1010248901903366e-07, + "loss": 1.156, + "mean_token_accuracy": 0.6624913811683655, + "num_tokens": 14894808.0, + "step": 575 + }, + { + "epoch": 0.06325499670546893, + "ewc_loss": 1.125037670135498e-06, + "grad_norm": 3.7876081466674805, + "learning_rate": 2.104685212298682e-07, + "loss": 1.0474, + "mean_token_accuracy": 0.6959102749824524, + "num_tokens": 14913195.0, + "step": 576 + }, + { + "epoch": 0.06336481440808259, + "ewc_loss": 1.125037670135498e-06, + "grad_norm": 2.903137445449829, + "learning_rate": 2.1083455344070279e-07, + "loss": 1.1929, + "mean_token_accuracy": 0.6597322225570679, + "num_tokens": 14943929.0, + "step": 577 + }, + { + "epoch": 0.06347463211069625, + "ewc_loss": 1.1324882507324219e-06, + 
"grad_norm": 3.2205140590667725, + "learning_rate": 2.1120058565153734e-07, + "loss": 1.1748, + "mean_token_accuracy": 0.6649990081787109, + "num_tokens": 14969696.0, + "step": 578 + }, + { + "epoch": 0.0635844498133099, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 3.801084041595459, + "learning_rate": 2.1156661786237186e-07, + "loss": 1.0165, + "mean_token_accuracy": 0.6965669393539429, + "num_tokens": 14989320.0, + "step": 579 + }, + { + "epoch": 0.06369426751592357, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 3.1361210346221924, + "learning_rate": 2.1193265007320643e-07, + "loss": 1.0817, + "mean_token_accuracy": 0.6841742396354675, + "num_tokens": 15014145.0, + "step": 580 + }, + { + "epoch": 0.06380408521853723, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 2.9297726154327393, + "learning_rate": 2.1229868228404098e-07, + "loss": 1.1615, + "mean_token_accuracy": 0.6612171530723572, + "num_tokens": 15040827.0, + "step": 581 + }, + { + "epoch": 0.06391390292115089, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 2.869544744491577, + "learning_rate": 2.1266471449487556e-07, + "loss": 1.2094, + "mean_token_accuracy": 0.6581337451934814, + "num_tokens": 15069707.0, + "step": 582 + }, + { + "epoch": 0.06402372062376455, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 3.5320560932159424, + "learning_rate": 2.1303074670571008e-07, + "loss": 1.1466, + "mean_token_accuracy": 0.6675810813903809, + "num_tokens": 15091567.0, + "step": 583 + }, + { + "epoch": 0.0641335383263782, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 3.302053928375244, + "learning_rate": 2.1339677891654466e-07, + "loss": 1.2216, + "mean_token_accuracy": 0.6526415348052979, + "num_tokens": 15117176.0, + "step": 584 + }, + { + "epoch": 0.06424335602899188, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 2.6902036666870117, + "learning_rate": 2.137628111273792e-07, + "loss": 1.165, + "mean_token_accuracy": 0.6676959991455078, + "num_tokens": 15151656.0, + 
"step": 585 + }, + { + "epoch": 0.06435317373160554, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 3.1089894771575928, + "learning_rate": 2.1412884333821375e-07, + "loss": 1.0672, + "mean_token_accuracy": 0.6946052312850952, + "num_tokens": 15176031.0, + "step": 586 + }, + { + "epoch": 0.0644629914342192, + "ewc_loss": 1.1399388313293457e-06, + "grad_norm": 3.2337522506713867, + "learning_rate": 2.144948755490483e-07, + "loss": 1.1235, + "mean_token_accuracy": 0.6690376400947571, + "num_tokens": 15197744.0, + "step": 587 + }, + { + "epoch": 0.06457280913683286, + "ewc_loss": 1.1324882507324219e-06, + "grad_norm": 3.8184468746185303, + "learning_rate": 2.1486090775988285e-07, + "loss": 1.1029, + "mean_token_accuracy": 0.680607795715332, + "num_tokens": 15219377.0, + "step": 588 + }, + { + "epoch": 0.06468262683944652, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 3.1356756687164307, + "learning_rate": 2.1522693997071743e-07, + "loss": 1.1174, + "mean_token_accuracy": 0.6740915179252625, + "num_tokens": 15244196.0, + "step": 589 + }, + { + "epoch": 0.06479244454206018, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 2.9831387996673584, + "learning_rate": 2.1559297218155198e-07, + "loss": 1.1692, + "mean_token_accuracy": 0.6739146709442139, + "num_tokens": 15273093.0, + "step": 590 + }, + { + "epoch": 0.06490226224467384, + "ewc_loss": 1.169741153717041e-06, + "grad_norm": 3.169919490814209, + "learning_rate": 2.159590043923865e-07, + "loss": 1.1255, + "mean_token_accuracy": 0.6765520572662354, + "num_tokens": 15299076.0, + "step": 591 + }, + { + "epoch": 0.0650120799472875, + "ewc_loss": 1.169741153717041e-06, + "grad_norm": 3.1748316287994385, + "learning_rate": 2.1632503660322108e-07, + "loss": 1.0442, + "mean_token_accuracy": 0.7003849744796753, + "num_tokens": 15323632.0, + "step": 592 + }, + { + "epoch": 0.06512189764990116, + "ewc_loss": 1.169741153717041e-06, + "grad_norm": 2.652538537979126, + "learning_rate": 2.1669106881405563e-07, + 
"loss": 1.1415, + "mean_token_accuracy": 0.6713597178459167, + "num_tokens": 15356700.0, + "step": 593 + }, + { + "epoch": 0.06523171535251482, + "ewc_loss": 1.169741153717041e-06, + "grad_norm": 2.8125758171081543, + "learning_rate": 2.170571010248902e-07, + "loss": 1.0491, + "mean_token_accuracy": 0.6968796849250793, + "num_tokens": 15382486.0, + "step": 594 + }, + { + "epoch": 0.06534153305512849, + "ewc_loss": 1.1771917343139648e-06, + "grad_norm": 3.792725086212158, + "learning_rate": 2.1742313323572472e-07, + "loss": 1.0752, + "mean_token_accuracy": 0.6904494166374207, + "num_tokens": 15403037.0, + "step": 595 + }, + { + "epoch": 0.06545135075774215, + "ewc_loss": 1.169741153717041e-06, + "grad_norm": 3.4183435440063477, + "learning_rate": 2.177891654465593e-07, + "loss": 1.071, + "mean_token_accuracy": 0.6899940967559814, + "num_tokens": 15427884.0, + "step": 596 + }, + { + "epoch": 0.06556116846035581, + "ewc_loss": 1.1846423149108887e-06, + "grad_norm": 2.7794737815856934, + "learning_rate": 2.1815519765739385e-07, + "loss": 1.1444, + "mean_token_accuracy": 0.6662429571151733, + "num_tokens": 15461669.0, + "step": 597 + }, + { + "epoch": 0.06567098616296947, + "ewc_loss": 1.1846423149108887e-06, + "grad_norm": 3.7980566024780273, + "learning_rate": 2.185212298682284e-07, + "loss": 1.0867, + "mean_token_accuracy": 0.6793921589851379, + "num_tokens": 15484624.0, + "step": 598 + }, + { + "epoch": 0.06578080386558313, + "ewc_loss": 1.1846423149108887e-06, + "grad_norm": 3.0548524856567383, + "learning_rate": 2.1888726207906295e-07, + "loss": 1.182, + "mean_token_accuracy": 0.6695680618286133, + "num_tokens": 15508390.0, + "step": 599 + }, + { + "epoch": 0.06589062156819679, + "ewc_loss": 1.1920928955078125e-06, + "grad_norm": 3.136972427368164, + "learning_rate": 2.192532942898975e-07, + "loss": 1.1081, + "mean_token_accuracy": 0.679661750793457, + "num_tokens": 15535744.0, + "step": 600 + }, + { + "epoch": 0.06600043927081045, + "ewc_loss": 
1.1920928955078125e-06, + "grad_norm": 2.9548184871673584, + "learning_rate": 2.1961932650073207e-07, + "loss": 1.1122, + "mean_token_accuracy": 0.6790259480476379, + "num_tokens": 15565036.0, + "step": 601 + }, + { + "epoch": 0.06611025697342411, + "ewc_loss": 1.1846423149108887e-06, + "grad_norm": 2.417019844055176, + "learning_rate": 2.1998535871156662e-07, + "loss": 1.2064, + "mean_token_accuracy": 0.6576114892959595, + "num_tokens": 15597806.0, + "step": 602 + }, + { + "epoch": 0.06622007467603777, + "ewc_loss": 1.1920928955078125e-06, + "grad_norm": 3.1942715644836426, + "learning_rate": 2.2035139092240114e-07, + "loss": 1.12, + "mean_token_accuracy": 0.6732602119445801, + "num_tokens": 15623941.0, + "step": 603 + }, + { + "epoch": 0.06632989237865144, + "ewc_loss": 1.2069940567016602e-06, + "grad_norm": 3.380642890930176, + "learning_rate": 2.2071742313323572e-07, + "loss": 1.1594, + "mean_token_accuracy": 0.6615607142448425, + "num_tokens": 15649843.0, + "step": 604 + }, + { + "epoch": 0.0664397100812651, + "ewc_loss": 1.2069940567016602e-06, + "grad_norm": 2.945847272872925, + "learning_rate": 2.2108345534407027e-07, + "loss": 1.0535, + "mean_token_accuracy": 0.6891932487487793, + "num_tokens": 15677720.0, + "step": 605 + }, + { + "epoch": 0.06654952778387876, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 3.0000956058502197, + "learning_rate": 2.2144948755490484e-07, + "loss": 1.063, + "mean_token_accuracy": 0.6873190402984619, + "num_tokens": 15704251.0, + "step": 606 + }, + { + "epoch": 0.06665934548649242, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 3.518522262573242, + "learning_rate": 2.2181551976573937e-07, + "loss": 1.1424, + "mean_token_accuracy": 0.6676781177520752, + "num_tokens": 15729100.0, + "step": 607 + }, + { + "epoch": 0.06676916318910608, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 3.208937644958496, + "learning_rate": 2.2218155197657394e-07, + "loss": 1.1686, + "mean_token_accuracy": 0.6564560532569885, + 
"num_tokens": 15753408.0, + "step": 608 + }, + { + "epoch": 0.06687898089171974, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 3.1663615703582764, + "learning_rate": 2.225475841874085e-07, + "loss": 1.0829, + "mean_token_accuracy": 0.6899448037147522, + "num_tokens": 15777400.0, + "step": 609 + }, + { + "epoch": 0.0669887985943334, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 2.862779378890991, + "learning_rate": 2.2291361639824304e-07, + "loss": 1.1969, + "mean_token_accuracy": 0.6635714769363403, + "num_tokens": 15808367.0, + "step": 610 + }, + { + "epoch": 0.06709861629694706, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 3.1893324851989746, + "learning_rate": 2.232796486090776e-07, + "loss": 1.0253, + "mean_token_accuracy": 0.7072858810424805, + "num_tokens": 15831105.0, + "step": 611 + }, + { + "epoch": 0.06720843399956072, + "ewc_loss": 1.2293457984924316e-06, + "grad_norm": 3.1167914867401123, + "learning_rate": 2.2364568081991214e-07, + "loss": 1.1343, + "mean_token_accuracy": 0.6729739904403687, + "num_tokens": 15854816.0, + "step": 612 + }, + { + "epoch": 0.06731825170217438, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 3.4758875370025635, + "learning_rate": 2.240117130307467e-07, + "loss": 1.1224, + "mean_token_accuracy": 0.6750836968421936, + "num_tokens": 15873146.0, + "step": 613 + }, + { + "epoch": 0.06742806940478806, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 2.8320751190185547, + "learning_rate": 2.2437774524158126e-07, + "loss": 1.1707, + "mean_token_accuracy": 0.6682945489883423, + "num_tokens": 15898651.0, + "step": 614 + }, + { + "epoch": 0.06753788710740172, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 3.252211809158325, + "learning_rate": 2.2474377745241578e-07, + "loss": 1.1696, + "mean_token_accuracy": 0.6559016108512878, + "num_tokens": 15921202.0, + "step": 615 + }, + { + "epoch": 0.06764770481001538, + "ewc_loss": 1.2218952178955078e-06, + "grad_norm": 3.252746105194092, + "learning_rate": 
2.2510980966325036e-07, + "loss": 1.0188, + "mean_token_accuracy": 0.694692850112915, + "num_tokens": 15940488.0, + "step": 616 + }, + { + "epoch": 0.06775752251262904, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 2.95320725440979, + "learning_rate": 2.254758418740849e-07, + "loss": 1.104, + "mean_token_accuracy": 0.6834671497344971, + "num_tokens": 15967477.0, + "step": 617 + }, + { + "epoch": 0.0678673402152427, + "ewc_loss": 1.2442469596862793e-06, + "grad_norm": 2.863753318786621, + "learning_rate": 2.2584187408491948e-07, + "loss": 1.1193, + "mean_token_accuracy": 0.6741142868995667, + "num_tokens": 15995250.0, + "step": 618 + }, + { + "epoch": 0.06797715791785636, + "ewc_loss": 1.2442469596862793e-06, + "grad_norm": 2.908425807952881, + "learning_rate": 2.26207906295754e-07, + "loss": 1.2304, + "mean_token_accuracy": 0.6528483629226685, + "num_tokens": 16020883.0, + "step": 619 + }, + { + "epoch": 0.06808697562047002, + "ewc_loss": 1.2516975402832031e-06, + "grad_norm": 2.807766914367676, + "learning_rate": 2.2657393850658856e-07, + "loss": 1.2448, + "mean_token_accuracy": 0.6505585312843323, + "num_tokens": 16047600.0, + "step": 620 + }, + { + "epoch": 0.06819679332308368, + "ewc_loss": 1.2516975402832031e-06, + "grad_norm": 3.1197214126586914, + "learning_rate": 2.2693997071742313e-07, + "loss": 1.1359, + "mean_token_accuracy": 0.6705152988433838, + "num_tokens": 16070115.0, + "step": 621 + }, + { + "epoch": 0.06830661102569734, + "ewc_loss": 1.2442469596862793e-06, + "grad_norm": 2.8688743114471436, + "learning_rate": 2.2730600292825768e-07, + "loss": 1.1661, + "mean_token_accuracy": 0.6683651804924011, + "num_tokens": 16096102.0, + "step": 622 + }, + { + "epoch": 0.06841642872831101, + "ewc_loss": 1.2442469596862793e-06, + "grad_norm": 2.865269422531128, + "learning_rate": 2.2767203513909223e-07, + "loss": 1.2008, + "mean_token_accuracy": 0.6618033647537231, + "num_tokens": 16122671.0, + "step": 623 + }, + { + "epoch": 0.06852624643092467, + 
"ewc_loss": 1.259148120880127e-06, + "grad_norm": 3.0877017974853516, + "learning_rate": 2.2803806734992678e-07, + "loss": 1.0512, + "mean_token_accuracy": 0.6865952014923096, + "num_tokens": 16149145.0, + "step": 624 + }, + { + "epoch": 0.06863606413353833, + "ewc_loss": 1.2516975402832031e-06, + "grad_norm": 2.766433000564575, + "learning_rate": 2.2840409956076135e-07, + "loss": 1.1354, + "mean_token_accuracy": 0.6812993884086609, + "num_tokens": 16178336.0, + "step": 625 + }, + { + "epoch": 0.06874588183615199, + "ewc_loss": 1.259148120880127e-06, + "grad_norm": 3.2724974155426025, + "learning_rate": 2.287701317715959e-07, + "loss": 1.1381, + "mean_token_accuracy": 0.6745977401733398, + "num_tokens": 16200808.0, + "step": 626 + }, + { + "epoch": 0.06885569953876565, + "ewc_loss": 1.259148120880127e-06, + "grad_norm": 2.744478702545166, + "learning_rate": 2.2913616398243043e-07, + "loss": 1.2581, + "mean_token_accuracy": 0.6423154473304749, + "num_tokens": 16228780.0, + "step": 627 + }, + { + "epoch": 0.06896551724137931, + "ewc_loss": 1.259148120880127e-06, + "grad_norm": 3.0429036617279053, + "learning_rate": 2.29502196193265e-07, + "loss": 1.193, + "mean_token_accuracy": 0.6516736745834351, + "num_tokens": 16256783.0, + "step": 628 + }, + { + "epoch": 0.06907533494399297, + "ewc_loss": 1.2740492820739746e-06, + "grad_norm": 2.9607582092285156, + "learning_rate": 2.2986822840409955e-07, + "loss": 1.1675, + "mean_token_accuracy": 0.6585558652877808, + "num_tokens": 16281018.0, + "step": 629 + }, + { + "epoch": 0.06918515264660663, + "ewc_loss": 1.2889504432678223e-06, + "grad_norm": 3.0288031101226807, + "learning_rate": 2.3023426061493413e-07, + "loss": 1.0624, + "mean_token_accuracy": 0.6891101002693176, + "num_tokens": 16306575.0, + "step": 630 + }, + { + "epoch": 0.06929497034922029, + "ewc_loss": 1.2889504432678223e-06, + "grad_norm": 2.718404531478882, + "learning_rate": 2.3060029282576865e-07, + "loss": 1.2, + "mean_token_accuracy": 0.6560938358306885, + 
"num_tokens": 16337454.0, + "step": 631 + }, + { + "epoch": 0.06940478805183395, + "ewc_loss": 1.2889504432678223e-06, + "grad_norm": 3.141529083251953, + "learning_rate": 2.309663250366032e-07, + "loss": 1.1373, + "mean_token_accuracy": 0.6661427617073059, + "num_tokens": 16360648.0, + "step": 632 + }, + { + "epoch": 0.06951460575444762, + "ewc_loss": 1.2889504432678223e-06, + "grad_norm": 2.6370849609375, + "learning_rate": 2.3133235724743777e-07, + "loss": 1.1352, + "mean_token_accuracy": 0.6702514886856079, + "num_tokens": 16389325.0, + "step": 633 + }, + { + "epoch": 0.06962442345706128, + "ewc_loss": 1.2889504432678223e-06, + "grad_norm": 3.2217257022857666, + "learning_rate": 2.3169838945827232e-07, + "loss": 1.1665, + "mean_token_accuracy": 0.6664612293243408, + "num_tokens": 16412500.0, + "step": 634 + }, + { + "epoch": 0.06973424115967494, + "ewc_loss": 1.296401023864746e-06, + "grad_norm": 2.3409829139709473, + "learning_rate": 2.3206442166910687e-07, + "loss": 1.2142, + "mean_token_accuracy": 0.6465054154396057, + "num_tokens": 16449528.0, + "step": 635 + }, + { + "epoch": 0.0698440588622886, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.7293753623962402, + "learning_rate": 2.3243045387994142e-07, + "loss": 1.0317, + "mean_token_accuracy": 0.7041391730308533, + "num_tokens": 16475464.0, + "step": 636 + }, + { + "epoch": 0.06995387656490226, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.7363924980163574, + "learning_rate": 2.32796486090776e-07, + "loss": 1.1242, + "mean_token_accuracy": 0.6736732721328735, + "num_tokens": 16503191.0, + "step": 637 + }, + { + "epoch": 0.07006369426751592, + "ewc_loss": 1.3187527656555176e-06, + "grad_norm": 2.6764347553253174, + "learning_rate": 2.3316251830161055e-07, + "loss": 1.1487, + "mean_token_accuracy": 0.664186418056488, + "num_tokens": 16532650.0, + "step": 638 + }, + { + "epoch": 0.07017351197012958, + "ewc_loss": 1.3187527656555176e-06, + "grad_norm": 2.805100917816162, + "learning_rate": 
2.3352855051244507e-07, + "loss": 1.2208, + "mean_token_accuracy": 0.6585053205490112, + "num_tokens": 16560255.0, + "step": 639 + }, + { + "epoch": 0.07028332967274324, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 2.516838312149048, + "learning_rate": 2.3389458272327964e-07, + "loss": 1.1773, + "mean_token_accuracy": 0.6588946580886841, + "num_tokens": 16591859.0, + "step": 640 + }, + { + "epoch": 0.0703931473753569, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 2.7021708488464355, + "learning_rate": 2.342606149341142e-07, + "loss": 1.1696, + "mean_token_accuracy": 0.665294885635376, + "num_tokens": 16620782.0, + "step": 641 + }, + { + "epoch": 0.07050296507797058, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 2.8994603157043457, + "learning_rate": 2.3462664714494877e-07, + "loss": 1.177, + "mean_token_accuracy": 0.655299723148346, + "num_tokens": 16648051.0, + "step": 642 + }, + { + "epoch": 0.07061278278058424, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 2.623753547668457, + "learning_rate": 2.349926793557833e-07, + "loss": 1.2132, + "mean_token_accuracy": 0.6574536561965942, + "num_tokens": 16677294.0, + "step": 643 + }, + { + "epoch": 0.0707226004831979, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 2.65108585357666, + "learning_rate": 2.3535871156661784e-07, + "loss": 1.2461, + "mean_token_accuracy": 0.6462360620498657, + "num_tokens": 16707122.0, + "step": 644 + }, + { + "epoch": 0.07083241818581155, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 2.6382899284362793, + "learning_rate": 2.3572474377745242e-07, + "loss": 1.1494, + "mean_token_accuracy": 0.6647434234619141, + "num_tokens": 16735039.0, + "step": 645 + }, + { + "epoch": 0.07094223588842521, + "ewc_loss": 1.3485550880432129e-06, + "grad_norm": 3.2696192264556885, + "learning_rate": 2.3609077598828696e-07, + "loss": 1.0911, + "mean_token_accuracy": 0.686629056930542, + "num_tokens": 16759433.0, + "step": 646 + }, + { + "epoch": 0.07105205359103887, + 
"ewc_loss": 1.3485550880432129e-06, + "grad_norm": 2.9115402698516846, + "learning_rate": 2.3645680819912151e-07, + "loss": 1.0993, + "mean_token_accuracy": 0.6770952343940735, + "num_tokens": 16785559.0, + "step": 647 + }, + { + "epoch": 0.07116187129365253, + "ewc_loss": 1.3485550880432129e-06, + "grad_norm": 3.294719696044922, + "learning_rate": 2.3682284040995606e-07, + "loss": 1.0835, + "mean_token_accuracy": 0.6844225525856018, + "num_tokens": 16805432.0, + "step": 648 + }, + { + "epoch": 0.0712716889962662, + "ewc_loss": 1.3485550880432129e-06, + "grad_norm": 3.0460736751556396, + "learning_rate": 2.3718887262079064e-07, + "loss": 1.106, + "mean_token_accuracy": 0.6791435480117798, + "num_tokens": 16828225.0, + "step": 649 + }, + { + "epoch": 0.07138150669887985, + "ewc_loss": 1.3560056686401367e-06, + "grad_norm": 2.9509689807891846, + "learning_rate": 2.375549048316252e-07, + "loss": 1.1441, + "mean_token_accuracy": 0.6762443780899048, + "num_tokens": 16853283.0, + "step": 650 + }, + { + "epoch": 0.07149132440149351, + "ewc_loss": 1.3560056686401367e-06, + "grad_norm": 2.947587490081787, + "learning_rate": 2.379209370424597e-07, + "loss": 1.1611, + "mean_token_accuracy": 0.6655224561691284, + "num_tokens": 16880661.0, + "step": 651 + }, + { + "epoch": 0.07160114210410719, + "ewc_loss": 1.3560056686401367e-06, + "grad_norm": 2.960336208343506, + "learning_rate": 2.3828696925329429e-07, + "loss": 1.1996, + "mean_token_accuracy": 0.6555237770080566, + "num_tokens": 16905765.0, + "step": 652 + }, + { + "epoch": 0.07171095980672085, + "ewc_loss": 1.3560056686401367e-06, + "grad_norm": 3.343071699142456, + "learning_rate": 2.3865300146412886e-07, + "loss": 1.1327, + "mean_token_accuracy": 0.673369288444519, + "num_tokens": 16926151.0, + "step": 653 + }, + { + "epoch": 0.07182077750933451, + "ewc_loss": 1.3634562492370605e-06, + "grad_norm": 2.730198860168457, + "learning_rate": 2.390190336749634e-07, + "loss": 1.1545, + "mean_token_accuracy": 0.6677789688110352, 
+ "num_tokens": 16955098.0, + "step": 654 + }, + { + "epoch": 0.07193059521194817, + "ewc_loss": 1.3634562492370605e-06, + "grad_norm": 3.3722143173217773, + "learning_rate": 2.393850658857979e-07, + "loss": 1.0834, + "mean_token_accuracy": 0.6847307682037354, + "num_tokens": 16975049.0, + "step": 655 + }, + { + "epoch": 0.07204041291456183, + "ewc_loss": 1.3634562492370605e-06, + "grad_norm": 3.5352280139923096, + "learning_rate": 2.397510980966325e-07, + "loss": 1.0082, + "mean_token_accuracy": 0.7015705108642578, + "num_tokens": 16993119.0, + "step": 656 + }, + { + "epoch": 0.07215023061717549, + "ewc_loss": 1.3634562492370605e-06, + "grad_norm": 2.90600848197937, + "learning_rate": 2.4011713030746706e-07, + "loss": 1.0866, + "mean_token_accuracy": 0.6830500960350037, + "num_tokens": 17016760.0, + "step": 657 + }, + { + "epoch": 0.07226004831978915, + "ewc_loss": 1.3634562492370605e-06, + "grad_norm": 2.379214286804199, + "learning_rate": 2.404831625183016e-07, + "loss": 1.1486, + "mean_token_accuracy": 0.6637779474258423, + "num_tokens": 17046841.0, + "step": 658 + }, + { + "epoch": 0.0723698660224028, + "ewc_loss": 1.3634562492370605e-06, + "grad_norm": 2.806995391845703, + "learning_rate": 2.4084919472913616e-07, + "loss": 1.2338, + "mean_token_accuracy": 0.6476492881774902, + "num_tokens": 17073782.0, + "step": 659 + }, + { + "epoch": 0.07247968372501647, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 2.641428232192993, + "learning_rate": 2.412152269399707e-07, + "loss": 1.1708, + "mean_token_accuracy": 0.6669885516166687, + "num_tokens": 17105514.0, + "step": 660 + }, + { + "epoch": 0.07258950142763014, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 2.9342057704925537, + "learning_rate": 2.4158125915080525e-07, + "loss": 1.1133, + "mean_token_accuracy": 0.6757718324661255, + "num_tokens": 17128183.0, + "step": 661 + }, + { + "epoch": 0.0726993191302438, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 3.1168360710144043, + "learning_rate": 
2.419472913616398e-07, + "loss": 1.1307, + "mean_token_accuracy": 0.6695187091827393, + "num_tokens": 17149575.0, + "step": 662 + }, + { + "epoch": 0.07280913683285746, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 2.8924341201782227, + "learning_rate": 2.4231332357247435e-07, + "loss": 1.084, + "mean_token_accuracy": 0.6865603923797607, + "num_tokens": 17175446.0, + "step": 663 + }, + { + "epoch": 0.07291895453547112, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 2.6091697216033936, + "learning_rate": 2.426793557833089e-07, + "loss": 1.0843, + "mean_token_accuracy": 0.6826053857803345, + "num_tokens": 17206437.0, + "step": 664 + }, + { + "epoch": 0.07302877223808478, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 2.71030592918396, + "learning_rate": 2.430453879941435e-07, + "loss": 1.1868, + "mean_token_accuracy": 0.6513446569442749, + "num_tokens": 17232640.0, + "step": 665 + }, + { + "epoch": 0.07313858994069844, + "ewc_loss": 1.385807991027832e-06, + "grad_norm": 2.6221799850463867, + "learning_rate": 2.4341142020497805e-07, + "loss": 1.1745, + "mean_token_accuracy": 0.6635047793388367, + "num_tokens": 17260421.0, + "step": 666 + }, + { + "epoch": 0.0732484076433121, + "ewc_loss": 1.385807991027832e-06, + "grad_norm": 2.76350736618042, + "learning_rate": 2.4377745241581255e-07, + "loss": 0.9775, + "mean_token_accuracy": 0.7118943333625793, + "num_tokens": 17283253.0, + "step": 667 + }, + { + "epoch": 0.07335822534592576, + "ewc_loss": 1.3932585716247559e-06, + "grad_norm": 3.23046612739563, + "learning_rate": 2.4414348462664715e-07, + "loss": 1.1386, + "mean_token_accuracy": 0.6640775203704834, + "num_tokens": 17303932.0, + "step": 668 + }, + { + "epoch": 0.07346804304853942, + "ewc_loss": 1.4007091522216797e-06, + "grad_norm": 2.9820716381073, + "learning_rate": 2.445095168374817e-07, + "loss": 1.0203, + "mean_token_accuracy": 0.6988592147827148, + "num_tokens": 17324634.0, + "step": 669 + }, + { + "epoch": 0.07357786075115308, + 
"ewc_loss": 1.4156103134155273e-06, + "grad_norm": 2.817803144454956, + "learning_rate": 2.4487554904831625e-07, + "loss": 0.9956, + "mean_token_accuracy": 0.7048623561859131, + "num_tokens": 17347927.0, + "step": 670 + }, + { + "epoch": 0.07368767845376675, + "ewc_loss": 1.4230608940124512e-06, + "grad_norm": 2.777801036834717, + "learning_rate": 2.452415812591508e-07, + "loss": 1.0535, + "mean_token_accuracy": 0.6957951784133911, + "num_tokens": 17374596.0, + "step": 671 + }, + { + "epoch": 0.07379749615638041, + "ewc_loss": 1.430511474609375e-06, + "grad_norm": 3.070054292678833, + "learning_rate": 2.4560761346998535e-07, + "loss": 1.0446, + "mean_token_accuracy": 0.6845657825469971, + "num_tokens": 17397546.0, + "step": 672 + }, + { + "epoch": 0.07390731385899407, + "ewc_loss": 1.4379620552062988e-06, + "grad_norm": 3.087644100189209, + "learning_rate": 2.459736456808199e-07, + "loss": 1.2289, + "mean_token_accuracy": 0.6549181342124939, + "num_tokens": 17420746.0, + "step": 673 + }, + { + "epoch": 0.07401713156160773, + "ewc_loss": 1.4379620552062988e-06, + "grad_norm": 3.0559122562408447, + "learning_rate": 2.4633967789165444e-07, + "loss": 1.0818, + "mean_token_accuracy": 0.6910358667373657, + "num_tokens": 17441694.0, + "step": 674 + }, + { + "epoch": 0.07412694926422139, + "ewc_loss": 1.4379620552062988e-06, + "grad_norm": 2.618149518966675, + "learning_rate": 2.46705710102489e-07, + "loss": 1.1576, + "mean_token_accuracy": 0.6694912314414978, + "num_tokens": 17468898.0, + "step": 675 + }, + { + "epoch": 0.07423676696683505, + "ewc_loss": 1.4528632164001465e-06, + "grad_norm": 3.2069544792175293, + "learning_rate": 2.4707174231332354e-07, + "loss": 1.0401, + "mean_token_accuracy": 0.6926059722900391, + "num_tokens": 17488143.0, + "step": 676 + }, + { + "epoch": 0.07434658466944871, + "ewc_loss": 1.4528632164001465e-06, + "grad_norm": 2.805389881134033, + "learning_rate": 2.4743777452415815e-07, + "loss": 1.0603, + "mean_token_accuracy": 0.6890392303466797, 
+ "num_tokens": 17514042.0, + "step": 677 + }, + { + "epoch": 0.07445640237206237, + "ewc_loss": 1.4528632164001465e-06, + "grad_norm": 2.864055871963501, + "learning_rate": 2.478038067349927e-07, + "loss": 1.048, + "mean_token_accuracy": 0.6901083588600159, + "num_tokens": 17538766.0, + "step": 678 + }, + { + "epoch": 0.07456622007467603, + "ewc_loss": 1.4603137969970703e-06, + "grad_norm": 2.549961566925049, + "learning_rate": 2.481698389458272e-07, + "loss": 1.1633, + "mean_token_accuracy": 0.6788822412490845, + "num_tokens": 17567824.0, + "step": 679 + }, + { + "epoch": 0.0746760377772897, + "ewc_loss": 1.4603137969970703e-06, + "grad_norm": 2.481473684310913, + "learning_rate": 2.485358711566618e-07, + "loss": 1.1858, + "mean_token_accuracy": 0.6602445840835571, + "num_tokens": 17599355.0, + "step": 680 + }, + { + "epoch": 0.07478585547990337, + "ewc_loss": 1.475214958190918e-06, + "grad_norm": 2.6062934398651123, + "learning_rate": 2.4890190336749634e-07, + "loss": 1.1577, + "mean_token_accuracy": 0.6636682748794556, + "num_tokens": 17625251.0, + "step": 681 + }, + { + "epoch": 0.07489567318251703, + "ewc_loss": 1.475214958190918e-06, + "grad_norm": 2.6639654636383057, + "learning_rate": 2.492679355783309e-07, + "loss": 1.1557, + "mean_token_accuracy": 0.6643235683441162, + "num_tokens": 17654107.0, + "step": 682 + }, + { + "epoch": 0.07500549088513069, + "ewc_loss": 1.475214958190918e-06, + "grad_norm": 3.0830185413360596, + "learning_rate": 2.4963396778916544e-07, + "loss": 1.0578, + "mean_token_accuracy": 0.6959063410758972, + "num_tokens": 17677403.0, + "step": 683 + }, + { + "epoch": 0.07511530858774434, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 2.522669792175293, + "learning_rate": 2.5e-07, + "loss": 1.1754, + "mean_token_accuracy": 0.6637900471687317, + "num_tokens": 17706274.0, + "step": 684 + }, + { + "epoch": 0.075225126290358, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 2.6417055130004883, + "learning_rate": 
2.5036603221083454e-07, + "loss": 1.1484, + "mean_token_accuracy": 0.6584980487823486, + "num_tokens": 17732923.0, + "step": 685 + }, + { + "epoch": 0.07533494399297166, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 2.8646721839904785, + "learning_rate": 2.507320644216691e-07, + "loss": 1.0922, + "mean_token_accuracy": 0.6851553916931152, + "num_tokens": 17758057.0, + "step": 686 + }, + { + "epoch": 0.07544476169558532, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 2.4617068767547607, + "learning_rate": 2.5109809663250364e-07, + "loss": 1.0919, + "mean_token_accuracy": 0.679391622543335, + "num_tokens": 17787037.0, + "step": 687 + }, + { + "epoch": 0.07555457939819898, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 3.2138075828552246, + "learning_rate": 2.514641288433382e-07, + "loss": 1.1476, + "mean_token_accuracy": 0.6704956293106079, + "num_tokens": 17807531.0, + "step": 688 + }, + { + "epoch": 0.07566439710081264, + "ewc_loss": 1.4975666999816895e-06, + "grad_norm": 3.5882537364959717, + "learning_rate": 2.518301610541728e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.6897420883178711, + "num_tokens": 17825910.0, + "step": 689 + }, + { + "epoch": 0.07577421480342632, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 3.088984251022339, + "learning_rate": 2.521961932650073e-07, + "loss": 1.1976, + "mean_token_accuracy": 0.650253415107727, + "num_tokens": 17849674.0, + "step": 690 + }, + { + "epoch": 0.07588403250603998, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 2.515252113342285, + "learning_rate": 2.525622254758419e-07, + "loss": 1.211, + "mean_token_accuracy": 0.6600900292396545, + "num_tokens": 17884281.0, + "step": 691 + }, + { + "epoch": 0.07599385020865364, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 2.8077399730682373, + "learning_rate": 2.5292825768667643e-07, + "loss": 1.1265, + "mean_token_accuracy": 0.6751838326454163, + "num_tokens": 17907327.0, + "step": 692 + }, + { + "epoch": 0.0761036679112673, + 
"ewc_loss": 1.4975666999816895e-06, + "grad_norm": 2.775191307067871, + "learning_rate": 2.5329428989751093e-07, + "loss": 1.1218, + "mean_token_accuracy": 0.6765932440757751, + "num_tokens": 17931212.0, + "step": 693 + }, + { + "epoch": 0.07621348561388096, + "ewc_loss": 1.4975666999816895e-06, + "grad_norm": 2.882786750793457, + "learning_rate": 2.5366032210834553e-07, + "loss": 1.1863, + "mean_token_accuracy": 0.6596498489379883, + "num_tokens": 17957452.0, + "step": 694 + }, + { + "epoch": 0.07632330331649462, + "ewc_loss": 1.5124678611755371e-06, + "grad_norm": 3.6756324768066406, + "learning_rate": 2.540263543191801e-07, + "loss": 1.0639, + "mean_token_accuracy": 0.6827738285064697, + "num_tokens": 17974591.0, + "step": 695 + }, + { + "epoch": 0.07643312101910828, + "ewc_loss": 1.519918441772461e-06, + "grad_norm": 2.7823495864868164, + "learning_rate": 2.543923865300146e-07, + "loss": 1.1086, + "mean_token_accuracy": 0.6857841610908508, + "num_tokens": 18002528.0, + "step": 696 + }, + { + "epoch": 0.07654293872172194, + "ewc_loss": 1.519918441772461e-06, + "grad_norm": 2.816615104675293, + "learning_rate": 2.547584187408492e-07, + "loss": 1.1256, + "mean_token_accuracy": 0.6686936616897583, + "num_tokens": 18027367.0, + "step": 697 + }, + { + "epoch": 0.0766527564243356, + "ewc_loss": 1.519918441772461e-06, + "grad_norm": 2.794100761413574, + "learning_rate": 2.5512445095168373e-07, + "loss": 1.1572, + "mean_token_accuracy": 0.6626451015472412, + "num_tokens": 18052873.0, + "step": 698 + }, + { + "epoch": 0.07676257412694927, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 2.686162233352661, + "learning_rate": 2.5549048316251833e-07, + "loss": 1.144, + "mean_token_accuracy": 0.6656602025032043, + "num_tokens": 18086772.0, + "step": 699 + }, + { + "epoch": 0.07687239182956293, + "ewc_loss": 1.5273690223693848e-06, + "grad_norm": 2.700930118560791, + "learning_rate": 2.5585651537335283e-07, + "loss": 1.1799, + "mean_token_accuracy": 0.6595160961151123, + 
"num_tokens": 18115267.0, + "step": 700 + }, + { + "epoch": 0.07698220953217659, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 2.705434799194336, + "learning_rate": 2.562225475841874e-07, + "loss": 1.1355, + "mean_token_accuracy": 0.6712997555732727, + "num_tokens": 18144689.0, + "step": 701 + }, + { + "epoch": 0.07709202723479025, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 3.005836248397827, + "learning_rate": 2.56588579795022e-07, + "loss": 1.1467, + "mean_token_accuracy": 0.674110472202301, + "num_tokens": 18168911.0, + "step": 702 + }, + { + "epoch": 0.07720184493740391, + "ewc_loss": 1.5422701835632324e-06, + "grad_norm": 2.926523208618164, + "learning_rate": 2.569546120058565e-07, + "loss": 1.1129, + "mean_token_accuracy": 0.6761605739593506, + "num_tokens": 18193758.0, + "step": 703 + }, + { + "epoch": 0.07731166264001757, + "ewc_loss": 1.5422701835632324e-06, + "grad_norm": 3.0047402381896973, + "learning_rate": 2.57320644216691e-07, + "loss": 1.1067, + "mean_token_accuracy": 0.6740140318870544, + "num_tokens": 18216265.0, + "step": 704 + }, + { + "epoch": 0.07742148034263123, + "ewc_loss": 1.5422701835632324e-06, + "grad_norm": 2.6594908237457275, + "learning_rate": 2.576866764275256e-07, + "loss": 1.0754, + "mean_token_accuracy": 0.6822424530982971, + "num_tokens": 18240261.0, + "step": 705 + }, + { + "epoch": 0.07753129804524489, + "ewc_loss": 1.5422701835632324e-06, + "grad_norm": 2.5660018920898438, + "learning_rate": 2.580527086383602e-07, + "loss": 1.2123, + "mean_token_accuracy": 0.6500896215438843, + "num_tokens": 18267819.0, + "step": 706 + }, + { + "epoch": 0.07764111574785855, + "ewc_loss": 1.5422701835632324e-06, + "grad_norm": 2.679333448410034, + "learning_rate": 2.584187408491947e-07, + "loss": 1.1186, + "mean_token_accuracy": 0.6781539916992188, + "num_tokens": 18295498.0, + "step": 707 + }, + { + "epoch": 0.07775093345047221, + "ewc_loss": 1.55717134475708e-06, + "grad_norm": 2.5650839805603027, + "learning_rate": 
2.5878477306002927e-07, + "loss": 1.1521, + "mean_token_accuracy": 0.6615671515464783, + "num_tokens": 18324344.0, + "step": 708 + }, + { + "epoch": 0.07786075115308588, + "ewc_loss": 1.55717134475708e-06, + "grad_norm": 3.2674028873443604, + "learning_rate": 2.591508052708638e-07, + "loss": 1.1191, + "mean_token_accuracy": 0.6739200353622437, + "num_tokens": 18345249.0, + "step": 709 + }, + { + "epoch": 0.07797056885569954, + "ewc_loss": 1.55717134475708e-06, + "grad_norm": 2.81643009185791, + "learning_rate": 2.5951683748169837e-07, + "loss": 1.2224, + "mean_token_accuracy": 0.6512885689735413, + "num_tokens": 18369976.0, + "step": 710 + }, + { + "epoch": 0.0780803865583132, + "ewc_loss": 1.55717134475708e-06, + "grad_norm": 2.9226932525634766, + "learning_rate": 2.598828696925329e-07, + "loss": 1.0537, + "mean_token_accuracy": 0.6874256730079651, + "num_tokens": 18389828.0, + "step": 711 + }, + { + "epoch": 0.07819020426092686, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 2.504140853881836, + "learning_rate": 2.6024890190336747e-07, + "loss": 1.1287, + "mean_token_accuracy": 0.6795159578323364, + "num_tokens": 18418228.0, + "step": 712 + }, + { + "epoch": 0.07830002196354052, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 2.7417638301849365, + "learning_rate": 2.6061493411420207e-07, + "loss": 1.1286, + "mean_token_accuracy": 0.6640219688415527, + "num_tokens": 18442229.0, + "step": 713 + }, + { + "epoch": 0.07840983966615418, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 2.541116952896118, + "learning_rate": 2.6098096632503657e-07, + "loss": 1.1593, + "mean_token_accuracy": 0.6632442474365234, + "num_tokens": 18469419.0, + "step": 714 + }, + { + "epoch": 0.07851965736876784, + "ewc_loss": 1.5795230865478516e-06, + "grad_norm": 2.6217899322509766, + "learning_rate": 2.6134699853587117e-07, + "loss": 0.9608, + "mean_token_accuracy": 0.7121595144271851, + "num_tokens": 18493762.0, + "step": 715 + }, + { + "epoch": 0.0786294750713815, + 
"ewc_loss": 1.5795230865478516e-06, + "grad_norm": 3.1003639698028564, + "learning_rate": 2.617130307467057e-07, + "loss": 1.1268, + "mean_token_accuracy": 0.672346293926239, + "num_tokens": 18516052.0, + "step": 716 + }, + { + "epoch": 0.07873929277399516, + "ewc_loss": 1.5795230865478516e-06, + "grad_norm": 2.4841744899749756, + "learning_rate": 2.620790629575402e-07, + "loss": 1.1157, + "mean_token_accuracy": 0.6806783080101013, + "num_tokens": 18542974.0, + "step": 717 + }, + { + "epoch": 0.07884911047660884, + "ewc_loss": 1.5869736671447754e-06, + "grad_norm": 3.2366158962249756, + "learning_rate": 2.624450951683748e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6869064569473267, + "num_tokens": 18561988.0, + "step": 718 + }, + { + "epoch": 0.0789589281792225, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 2.7148499488830566, + "learning_rate": 2.6281112737920937e-07, + "loss": 1.1724, + "mean_token_accuracy": 0.6626514196395874, + "num_tokens": 18588546.0, + "step": 719 + }, + { + "epoch": 0.07906874588183616, + "ewc_loss": 1.5869736671447754e-06, + "grad_norm": 2.799156904220581, + "learning_rate": 2.6317715959004386e-07, + "loss": 1.1998, + "mean_token_accuracy": 0.6518329381942749, + "num_tokens": 18615106.0, + "step": 720 + }, + { + "epoch": 0.07917856358444982, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 2.459735631942749, + "learning_rate": 2.6354319180087846e-07, + "loss": 1.1156, + "mean_token_accuracy": 0.6755781769752502, + "num_tokens": 18642433.0, + "step": 721 + }, + { + "epoch": 0.07928838128706348, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 2.4767494201660156, + "learning_rate": 2.63909224011713e-07, + "loss": 1.1101, + "mean_token_accuracy": 0.6739821434020996, + "num_tokens": 18670567.0, + "step": 722 + }, + { + "epoch": 0.07939819898967714, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 3.024322509765625, + "learning_rate": 2.642752562225476e-07, + "loss": 1.1206, + "mean_token_accuracy": 0.6701622009277344, 
+ "num_tokens": 18691814.0, + "step": 723 + }, + { + "epoch": 0.0795080166922908, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 2.6503350734710693, + "learning_rate": 2.646412884333821e-07, + "loss": 1.1553, + "mean_token_accuracy": 0.6660803556442261, + "num_tokens": 18717182.0, + "step": 724 + }, + { + "epoch": 0.07961783439490445, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 2.433889865875244, + "learning_rate": 2.6500732064421666e-07, + "loss": 1.1352, + "mean_token_accuracy": 0.6645322442054749, + "num_tokens": 18746831.0, + "step": 725 + }, + { + "epoch": 0.07972765209751811, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 2.442485809326172, + "learning_rate": 2.6537335285505126e-07, + "loss": 1.0766, + "mean_token_accuracy": 0.6835814714431763, + "num_tokens": 18777233.0, + "step": 726 + }, + { + "epoch": 0.07983746980013177, + "ewc_loss": 1.6167759895324707e-06, + "grad_norm": 2.7254669666290283, + "learning_rate": 2.6573938506588576e-07, + "loss": 1.1277, + "mean_token_accuracy": 0.6764113903045654, + "num_tokens": 18802479.0, + "step": 727 + }, + { + "epoch": 0.07994728750274545, + "ewc_loss": 1.6167759895324707e-06, + "grad_norm": 2.436002731323242, + "learning_rate": 2.661054172767203e-07, + "loss": 1.1501, + "mean_token_accuracy": 0.6617291569709778, + "num_tokens": 18831630.0, + "step": 728 + }, + { + "epoch": 0.08005710520535911, + "ewc_loss": 1.6242265701293945e-06, + "grad_norm": 2.906801462173462, + "learning_rate": 2.664714494875549e-07, + "loss": 0.9694, + "mean_token_accuracy": 0.7144986391067505, + "num_tokens": 18853499.0, + "step": 729 + }, + { + "epoch": 0.08016692290797277, + "ewc_loss": 1.6242265701293945e-06, + "grad_norm": 2.8264975547790527, + "learning_rate": 2.6683748169838946e-07, + "loss": 1.1853, + "mean_token_accuracy": 0.6634581685066223, + "num_tokens": 18875616.0, + "step": 730 + }, + { + "epoch": 0.08027674061058643, + "ewc_loss": 1.6242265701293945e-06, + "grad_norm": 2.890836715698242, + "learning_rate": 
2.67203513909224e-07, + "loss": 1.1113, + "mean_token_accuracy": 0.6778938174247742, + "num_tokens": 18895943.0, + "step": 731 + }, + { + "epoch": 0.08038655831320009, + "ewc_loss": 1.6242265701293945e-06, + "grad_norm": 2.3626513481140137, + "learning_rate": 2.6756954612005856e-07, + "loss": 1.1681, + "mean_token_accuracy": 0.6624351739883423, + "num_tokens": 18930183.0, + "step": 732 + }, + { + "epoch": 0.08049637601581375, + "ewc_loss": 1.6316771507263184e-06, + "grad_norm": 2.9746673107147217, + "learning_rate": 2.679355783308931e-07, + "loss": 1.1662, + "mean_token_accuracy": 0.6647497415542603, + "num_tokens": 18953193.0, + "step": 733 + }, + { + "epoch": 0.08060619371842741, + "ewc_loss": 1.6316771507263184e-06, + "grad_norm": 2.511430501937866, + "learning_rate": 2.6830161054172765e-07, + "loss": 1.1972, + "mean_token_accuracy": 0.6580145955085754, + "num_tokens": 18981251.0, + "step": 734 + }, + { + "epoch": 0.08071601142104107, + "ewc_loss": 1.6391277313232422e-06, + "grad_norm": 2.523082733154297, + "learning_rate": 2.686676427525622e-07, + "loss": 1.1469, + "mean_token_accuracy": 0.6772140860557556, + "num_tokens": 19009491.0, + "step": 735 + }, + { + "epoch": 0.08082582912365473, + "ewc_loss": 1.6391277313232422e-06, + "grad_norm": 2.4621219635009766, + "learning_rate": 2.6903367496339675e-07, + "loss": 1.0776, + "mean_token_accuracy": 0.6846940517425537, + "num_tokens": 19037182.0, + "step": 736 + }, + { + "epoch": 0.0809356468262684, + "ewc_loss": 1.646578311920166e-06, + "grad_norm": 2.837832450866699, + "learning_rate": 2.693997071742313e-07, + "loss": 1.09, + "mean_token_accuracy": 0.6809585094451904, + "num_tokens": 19059901.0, + "step": 737 + }, + { + "epoch": 0.08104546452888206, + "ewc_loss": 1.646578311920166e-06, + "grad_norm": 2.7212941646575928, + "learning_rate": 2.6976573938506585e-07, + "loss": 1.1736, + "mean_token_accuracy": 0.663803219795227, + "num_tokens": 19085117.0, + "step": 738 + }, + { + "epoch": 0.08115528223149572, + 
"ewc_loss": 1.646578311920166e-06, + "grad_norm": 2.4721803665161133, + "learning_rate": 2.7013177159590045e-07, + "loss": 1.0806, + "mean_token_accuracy": 0.6817675828933716, + "num_tokens": 19111093.0, + "step": 739 + }, + { + "epoch": 0.08126509993410938, + "ewc_loss": 1.6540288925170898e-06, + "grad_norm": 2.7665719985961914, + "learning_rate": 2.70497803806735e-07, + "loss": 1.0945, + "mean_token_accuracy": 0.6752725839614868, + "num_tokens": 19135550.0, + "step": 740 + }, + { + "epoch": 0.08137491763672304, + "ewc_loss": 1.6540288925170898e-06, + "grad_norm": 2.5934431552886963, + "learning_rate": 2.708638360175695e-07, + "loss": 1.0664, + "mean_token_accuracy": 0.6846751570701599, + "num_tokens": 19162633.0, + "step": 741 + }, + { + "epoch": 0.0814847353393367, + "ewc_loss": 1.6540288925170898e-06, + "grad_norm": 2.636645555496216, + "learning_rate": 2.712298682284041e-07, + "loss": 1.0587, + "mean_token_accuracy": 0.6963876485824585, + "num_tokens": 19186813.0, + "step": 742 + }, + { + "epoch": 0.08159455304195036, + "ewc_loss": 1.6614794731140137e-06, + "grad_norm": 2.47805118560791, + "learning_rate": 2.7159590043923865e-07, + "loss": 1.1077, + "mean_token_accuracy": 0.6747453212738037, + "num_tokens": 19217664.0, + "step": 743 + }, + { + "epoch": 0.08170437074456402, + "ewc_loss": 1.6540288925170898e-06, + "grad_norm": 2.977264165878296, + "learning_rate": 2.7196193265007315e-07, + "loss": 1.087, + "mean_token_accuracy": 0.6757339239120483, + "num_tokens": 19239127.0, + "step": 744 + }, + { + "epoch": 0.08181418844717768, + "ewc_loss": 1.6540288925170898e-06, + "grad_norm": 2.6630921363830566, + "learning_rate": 2.7232796486090775e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.6785402297973633, + "num_tokens": 19263028.0, + "step": 745 + }, + { + "epoch": 0.08192400614979134, + "ewc_loss": 1.6540288925170898e-06, + "grad_norm": 2.406897783279419, + "learning_rate": 2.726939970717423e-07, + "loss": 1.1895, + "mean_token_accuracy": 0.6527125239372253, + 
"num_tokens": 19292808.0, + "step": 746 + }, + { + "epoch": 0.08203382385240501, + "ewc_loss": 1.6689300537109375e-06, + "grad_norm": 2.792187213897705, + "learning_rate": 2.730600292825769e-07, + "loss": 1.1209, + "mean_token_accuracy": 0.6738349795341492, + "num_tokens": 19314443.0, + "step": 747 + }, + { + "epoch": 0.08214364155501867, + "ewc_loss": 1.6689300537109375e-06, + "grad_norm": 2.7091259956359863, + "learning_rate": 2.734260614934114e-07, + "loss": 1.0692, + "mean_token_accuracy": 0.6905237436294556, + "num_tokens": 19339945.0, + "step": 748 + }, + { + "epoch": 0.08225345925763233, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 2.4583194255828857, + "learning_rate": 2.7379209370424594e-07, + "loss": 1.188, + "mean_token_accuracy": 0.6584681868553162, + "num_tokens": 19370487.0, + "step": 749 + }, + { + "epoch": 0.08236327696024599, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 2.5581085681915283, + "learning_rate": 2.7415812591508055e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6802638173103333, + "num_tokens": 19395644.0, + "step": 750 + }, + { + "epoch": 0.08247309466285965, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 2.2699086666107178, + "learning_rate": 2.7452415812591504e-07, + "loss": 1.1703, + "mean_token_accuracy": 0.6600353717803955, + "num_tokens": 19426218.0, + "step": 751 + }, + { + "epoch": 0.08258291236547331, + "ewc_loss": 1.6838312149047852e-06, + "grad_norm": 2.616651773452759, + "learning_rate": 2.748901903367496e-07, + "loss": 1.1678, + "mean_token_accuracy": 0.6649764776229858, + "num_tokens": 19451846.0, + "step": 752 + }, + { + "epoch": 0.08269273006808697, + "ewc_loss": 1.6838312149047852e-06, + "grad_norm": 2.5904946327209473, + "learning_rate": 2.752562225475842e-07, + "loss": 1.0901, + "mean_token_accuracy": 0.6858956813812256, + "num_tokens": 19476296.0, + "step": 753 + }, + { + "epoch": 0.08280254777070063, + "ewc_loss": 1.691281795501709e-06, + "grad_norm": 2.4619643688201904, + "learning_rate": 
2.756222547584187e-07, + "loss": 1.1304, + "mean_token_accuracy": 0.6788091063499451, + "num_tokens": 19504200.0, + "step": 754 + }, + { + "epoch": 0.08291236547331429, + "ewc_loss": 1.691281795501709e-06, + "grad_norm": 2.772606372833252, + "learning_rate": 2.759882869692533e-07, + "loss": 1.2079, + "mean_token_accuracy": 0.6554490327835083, + "num_tokens": 19527134.0, + "step": 755 + }, + { + "epoch": 0.08302218317592797, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 2.4479124546051025, + "learning_rate": 2.7635431918008784e-07, + "loss": 1.0916, + "mean_token_accuracy": 0.6828716397285461, + "num_tokens": 19556429.0, + "step": 756 + }, + { + "epoch": 0.08313200087854163, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 2.253976583480835, + "learning_rate": 2.767203513909224e-07, + "loss": 1.1645, + "mean_token_accuracy": 0.6616703867912292, + "num_tokens": 19587759.0, + "step": 757 + }, + { + "epoch": 0.08324181858115529, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 3.1203203201293945, + "learning_rate": 2.7708638360175694e-07, + "loss": 1.1141, + "mean_token_accuracy": 0.6815999746322632, + "num_tokens": 19607412.0, + "step": 758 + }, + { + "epoch": 0.08335163628376895, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 2.7141101360321045, + "learning_rate": 2.774524158125915e-07, + "loss": 1.0097, + "mean_token_accuracy": 0.7097141742706299, + "num_tokens": 19630184.0, + "step": 759 + }, + { + "epoch": 0.0834614539863826, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 2.667429208755493, + "learning_rate": 2.7781844802342604e-07, + "loss": 1.0923, + "mean_token_accuracy": 0.681110143661499, + "num_tokens": 19655787.0, + "step": 760 + }, + { + "epoch": 0.08357127168899627, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 2.4372265338897705, + "learning_rate": 2.781844802342606e-07, + "loss": 1.0573, + "mean_token_accuracy": 0.6919885873794556, + "num_tokens": 19683813.0, + "step": 761 + }, + { + "epoch": 0.08368108939160993, + 
"ewc_loss": 1.7210841178894043e-06, + "grad_norm": 2.8961477279663086, + "learning_rate": 2.7855051244509513e-07, + "loss": 1.0852, + "mean_token_accuracy": 0.6806028485298157, + "num_tokens": 19706512.0, + "step": 762 + }, + { + "epoch": 0.08379090709422359, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 2.3725688457489014, + "learning_rate": 2.7891654465592974e-07, + "loss": 1.1547, + "mean_token_accuracy": 0.6703260540962219, + "num_tokens": 19737097.0, + "step": 763 + }, + { + "epoch": 0.08390072479683724, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 2.6041314601898193, + "learning_rate": 2.792825768667643e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.7023797631263733, + "num_tokens": 19762815.0, + "step": 764 + }, + { + "epoch": 0.0840105424994509, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 2.796541929244995, + "learning_rate": 2.796486090775988e-07, + "loss": 1.1227, + "mean_token_accuracy": 0.6748864054679871, + "num_tokens": 19785444.0, + "step": 765 + }, + { + "epoch": 0.08412036020206458, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.742948055267334, + "learning_rate": 2.800146412884334e-07, + "loss": 1.1299, + "mean_token_accuracy": 0.6711357831954956, + "num_tokens": 19809678.0, + "step": 766 + }, + { + "epoch": 0.08423017790467824, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.561150550842285, + "learning_rate": 2.8038067349926793e-07, + "loss": 1.1043, + "mean_token_accuracy": 0.6744384169578552, + "num_tokens": 19834658.0, + "step": 767 + }, + { + "epoch": 0.0843399956072919, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.5120749473571777, + "learning_rate": 2.8074670571010243e-07, + "loss": 1.1899, + "mean_token_accuracy": 0.6608452200889587, + "num_tokens": 19860076.0, + "step": 768 + }, + { + "epoch": 0.08444981330990556, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.786416530609131, + "learning_rate": 2.8111273792093703e-07, + "loss": 1.1442, + "mean_token_accuracy": 
0.6680145263671875, + "num_tokens": 19885069.0, + "step": 769 + }, + { + "epoch": 0.08455963101251922, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.8274199962615967, + "learning_rate": 2.814787701317716e-07, + "loss": 1.2052, + "mean_token_accuracy": 0.660765528678894, + "num_tokens": 19909921.0, + "step": 770 + }, + { + "epoch": 0.08466944871513288, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.578254222869873, + "learning_rate": 2.818448023426062e-07, + "loss": 1.0604, + "mean_token_accuracy": 0.6968839168548584, + "num_tokens": 19935788.0, + "step": 771 + }, + { + "epoch": 0.08477926641774654, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.3883883953094482, + "learning_rate": 2.822108345534407e-07, + "loss": 1.1375, + "mean_token_accuracy": 0.667554497718811, + "num_tokens": 19965350.0, + "step": 772 + }, + { + "epoch": 0.0848890841203602, + "ewc_loss": 1.735985279083252e-06, + "grad_norm": 2.815683126449585, + "learning_rate": 2.8257686676427523e-07, + "loss": 1.0107, + "mean_token_accuracy": 0.704465389251709, + "num_tokens": 19985991.0, + "step": 773 + }, + { + "epoch": 0.08499890182297386, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.4967567920684814, + "learning_rate": 2.8294289897510983e-07, + "loss": 1.089, + "mean_token_accuracy": 0.6836088299751282, + "num_tokens": 20013789.0, + "step": 774 + }, + { + "epoch": 0.08510871952558753, + "ewc_loss": 1.7285346984863281e-06, + "grad_norm": 2.8509514331817627, + "learning_rate": 2.833089311859443e-07, + "loss": 1.0292, + "mean_token_accuracy": 0.6932339668273926, + "num_tokens": 20034001.0, + "step": 775 + }, + { + "epoch": 0.08521853722820119, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 2.6399872303009033, + "learning_rate": 2.836749633967789e-07, + "loss": 1.1498, + "mean_token_accuracy": 0.6641720533370972, + "num_tokens": 20060222.0, + "step": 776 + }, + { + "epoch": 0.08532835493081485, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 2.3730199337005615, + 
"learning_rate": 2.840409956076135e-07, + "loss": 1.084, + "mean_token_accuracy": 0.6835694313049316, + "num_tokens": 20093804.0, + "step": 777 + }, + { + "epoch": 0.08543817263342851, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 2.3098089694976807, + "learning_rate": 2.8440702781844797e-07, + "loss": 1.2213, + "mean_token_accuracy": 0.6427419185638428, + "num_tokens": 20126639.0, + "step": 778 + }, + { + "epoch": 0.08554799033604217, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 2.5459303855895996, + "learning_rate": 2.847730600292826e-07, + "loss": 1.0188, + "mean_token_accuracy": 0.7048238515853882, + "num_tokens": 20154205.0, + "step": 779 + }, + { + "epoch": 0.08565780803865583, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 2.331458330154419, + "learning_rate": 2.851390922401171e-07, + "loss": 1.2116, + "mean_token_accuracy": 0.6642403602600098, + "num_tokens": 20185261.0, + "step": 780 + }, + { + "epoch": 0.08576762574126949, + "ewc_loss": 1.7657876014709473e-06, + "grad_norm": 2.884675979614258, + "learning_rate": 2.8550512445095167e-07, + "loss": 1.0133, + "mean_token_accuracy": 0.6952124834060669, + "num_tokens": 20205777.0, + "step": 781 + }, + { + "epoch": 0.08587744344388315, + "ewc_loss": 1.7657876014709473e-06, + "grad_norm": 2.30430006980896, + "learning_rate": 2.858711566617862e-07, + "loss": 1.0579, + "mean_token_accuracy": 0.6894548535346985, + "num_tokens": 20237386.0, + "step": 782 + }, + { + "epoch": 0.08598726114649681, + "ewc_loss": 1.7657876014709473e-06, + "grad_norm": 2.603508710861206, + "learning_rate": 2.8623718887262077e-07, + "loss": 1.1158, + "mean_token_accuracy": 0.6774588823318481, + "num_tokens": 20261696.0, + "step": 783 + }, + { + "epoch": 0.08609707884911047, + "ewc_loss": 1.7657876014709473e-06, + "grad_norm": 2.5102250576019287, + "learning_rate": 2.866032210834553e-07, + "loss": 1.0832, + "mean_token_accuracy": 0.6808676719665527, + "num_tokens": 20287351.0, + "step": 784 + }, + { + "epoch": 
0.08620689655172414, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 2.700124502182007, + "learning_rate": 2.8696925329428987e-07, + "loss": 1.0048, + "mean_token_accuracy": 0.7024551630020142, + "num_tokens": 20309391.0, + "step": 785 + }, + { + "epoch": 0.0863167142543378, + "ewc_loss": 1.7657876014709473e-06, + "grad_norm": 2.7079896926879883, + "learning_rate": 2.873352855051244e-07, + "loss": 1.0641, + "mean_token_accuracy": 0.6822109222412109, + "num_tokens": 20332515.0, + "step": 786 + }, + { + "epoch": 0.08642653195695146, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 2.634598731994629, + "learning_rate": 2.87701317715959e-07, + "loss": 1.102, + "mean_token_accuracy": 0.6856441497802734, + "num_tokens": 20359379.0, + "step": 787 + }, + { + "epoch": 0.08653634965956512, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 2.7004830837249756, + "learning_rate": 2.8806734992679357e-07, + "loss": 1.1319, + "mean_token_accuracy": 0.6664392948150635, + "num_tokens": 20383436.0, + "step": 788 + }, + { + "epoch": 0.08664616736217878, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 2.272614002227783, + "learning_rate": 2.8843338213762807e-07, + "loss": 1.1087, + "mean_token_accuracy": 0.6759164333343506, + "num_tokens": 20414443.0, + "step": 789 + }, + { + "epoch": 0.08675598506479244, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 2.822556734085083, + "learning_rate": 2.8879941434846267e-07, + "loss": 1.0696, + "mean_token_accuracy": 0.680791974067688, + "num_tokens": 20437080.0, + "step": 790 + }, + { + "epoch": 0.0868658027674061, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 2.627904176712036, + "learning_rate": 2.891654465592972e-07, + "loss": 0.9665, + "mean_token_accuracy": 0.7151522040367126, + "num_tokens": 20461294.0, + "step": 791 + }, + { + "epoch": 0.08697562047001976, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 2.582376718521118, + "learning_rate": 2.895314787701317e-07, + "loss": 1.1049, + "mean_token_accuracy": 
0.6729703545570374, + "num_tokens": 20486377.0, + "step": 792 + }, + { + "epoch": 0.08708543817263342, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 3.01686692237854, + "learning_rate": 2.898975109809663e-07, + "loss": 1.0578, + "mean_token_accuracy": 0.6896916031837463, + "num_tokens": 20508898.0, + "step": 793 + }, + { + "epoch": 0.0871952558752471, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 2.307255983352661, + "learning_rate": 2.9026354319180086e-07, + "loss": 1.136, + "mean_token_accuracy": 0.6699768900871277, + "num_tokens": 20538441.0, + "step": 794 + }, + { + "epoch": 0.08730507357786076, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 2.560030460357666, + "learning_rate": 2.9062957540263547e-07, + "loss": 1.0827, + "mean_token_accuracy": 0.6839860677719116, + "num_tokens": 20561412.0, + "step": 795 + }, + { + "epoch": 0.08741489128047442, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 2.7246711254119873, + "learning_rate": 2.9099560761346996e-07, + "loss": 1.0829, + "mean_token_accuracy": 0.681331217288971, + "num_tokens": 20586226.0, + "step": 796 + }, + { + "epoch": 0.08752470898308808, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 2.2823452949523926, + "learning_rate": 2.913616398243045e-07, + "loss": 1.206, + "mean_token_accuracy": 0.6539856195449829, + "num_tokens": 20618182.0, + "step": 797 + }, + { + "epoch": 0.08763452668570174, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 2.428748846054077, + "learning_rate": 2.917276720351391e-07, + "loss": 1.1703, + "mean_token_accuracy": 0.6708976030349731, + "num_tokens": 20647672.0, + "step": 798 + }, + { + "epoch": 0.0877443443883154, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 2.6091837882995605, + "learning_rate": 2.920937042459736e-07, + "loss": 1.1092, + "mean_token_accuracy": 0.6885460615158081, + "num_tokens": 20671724.0, + "step": 799 + }, + { + "epoch": 0.08785416209092906, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 2.35172963142395, + 
"learning_rate": 2.9245973645680816e-07, + "loss": 1.1499, + "mean_token_accuracy": 0.682934045791626, + "num_tokens": 20700193.0, + "step": 800 + }, + { + "epoch": 0.08796397979354272, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 2.699875831604004, + "learning_rate": 2.9282576866764276e-07, + "loss": 1.0752, + "mean_token_accuracy": 0.6814903020858765, + "num_tokens": 20724905.0, + "step": 801 + }, + { + "epoch": 0.08807379749615638, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 2.582691192626953, + "learning_rate": 2.9319180087847726e-07, + "loss": 1.134, + "mean_token_accuracy": 0.6703799962997437, + "num_tokens": 20749749.0, + "step": 802 + }, + { + "epoch": 0.08818361519877004, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 2.354644298553467, + "learning_rate": 2.9355783308931186e-07, + "loss": 1.0076, + "mean_token_accuracy": 0.697622537612915, + "num_tokens": 20778139.0, + "step": 803 + }, + { + "epoch": 0.08829343290138371, + "ewc_loss": 1.8402934074401855e-06, + "grad_norm": 2.5546939373016357, + "learning_rate": 2.939238653001464e-07, + "loss": 1.1199, + "mean_token_accuracy": 0.68697190284729, + "num_tokens": 20801934.0, + "step": 804 + }, + { + "epoch": 0.08840325060399737, + "ewc_loss": 1.8402934074401855e-06, + "grad_norm": 2.4096357822418213, + "learning_rate": 2.9428989751098096e-07, + "loss": 1.118, + "mean_token_accuracy": 0.6760761141777039, + "num_tokens": 20832336.0, + "step": 805 + }, + { + "epoch": 0.08851306830661103, + "ewc_loss": 1.8402934074401855e-06, + "grad_norm": 2.499826192855835, + "learning_rate": 2.946559297218155e-07, + "loss": 1.1219, + "mean_token_accuracy": 0.6702616214752197, + "num_tokens": 20858646.0, + "step": 806 + }, + { + "epoch": 0.08862288600922469, + "ewc_loss": 1.8477439880371094e-06, + "grad_norm": 2.4255967140197754, + "learning_rate": 2.9502196193265006e-07, + "loss": 1.1463, + "mean_token_accuracy": 0.6674537658691406, + "num_tokens": 20887602.0, + "step": 807 + }, + { + "epoch": 
0.08873270371183835, + "ewc_loss": 1.8551945686340332e-06, + "grad_norm": 2.670570135116577, + "learning_rate": 2.953879941434846e-07, + "loss": 1.1716, + "mean_token_accuracy": 0.6587715148925781, + "num_tokens": 20912488.0, + "step": 808 + }, + { + "epoch": 0.08884252141445201, + "ewc_loss": 1.8551945686340332e-06, + "grad_norm": 2.3612723350524902, + "learning_rate": 2.9575402635431915e-07, + "loss": 1.1945, + "mean_token_accuracy": 0.6527847051620483, + "num_tokens": 20940624.0, + "step": 809 + }, + { + "epoch": 0.08895233911706567, + "ewc_loss": 1.8551945686340332e-06, + "grad_norm": 2.30556583404541, + "learning_rate": 2.961200585651537e-07, + "loss": 1.0891, + "mean_token_accuracy": 0.6822469234466553, + "num_tokens": 20970873.0, + "step": 810 + }, + { + "epoch": 0.08906215681967933, + "ewc_loss": 1.8551945686340332e-06, + "grad_norm": 2.711721658706665, + "learning_rate": 2.964860907759883e-07, + "loss": 1.0283, + "mean_token_accuracy": 0.6948552131652832, + "num_tokens": 20993869.0, + "step": 811 + }, + { + "epoch": 0.08917197452229299, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 2.4627461433410645, + "learning_rate": 2.9685212298682285e-07, + "loss": 1.0337, + "mean_token_accuracy": 0.69328773021698, + "num_tokens": 21019651.0, + "step": 812 + }, + { + "epoch": 0.08928179222490666, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 2.309415102005005, + "learning_rate": 2.9721815519765735e-07, + "loss": 1.0654, + "mean_token_accuracy": 0.6891283988952637, + "num_tokens": 21048736.0, + "step": 813 + }, + { + "epoch": 0.08939160992752032, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 2.7386419773101807, + "learning_rate": 2.9758418740849195e-07, + "loss": 1.1957, + "mean_token_accuracy": 0.6659548282623291, + "num_tokens": 21071505.0, + "step": 814 + }, + { + "epoch": 0.08950142763013398, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 2.8001997470855713, + "learning_rate": 2.979502196193265e-07, + "loss": 1.0278, + 
"mean_token_accuracy": 0.6972644329071045, + "num_tokens": 21094803.0, + "step": 815 + }, + { + "epoch": 0.08961124533274764, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 2.623490333557129, + "learning_rate": 2.98316251830161e-07, + "loss": 1.103, + "mean_token_accuracy": 0.6813284158706665, + "num_tokens": 21120328.0, + "step": 816 + }, + { + "epoch": 0.0897210630353613, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 2.470658779144287, + "learning_rate": 2.986822840409956e-07, + "loss": 1.1186, + "mean_token_accuracy": 0.6710608005523682, + "num_tokens": 21146162.0, + "step": 817 + }, + { + "epoch": 0.08983088073797496, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 2.5771126747131348, + "learning_rate": 2.9904831625183015e-07, + "loss": 1.0493, + "mean_token_accuracy": 0.6882209181785583, + "num_tokens": 21169041.0, + "step": 818 + }, + { + "epoch": 0.08994069844058862, + "ewc_loss": 1.8849968910217285e-06, + "grad_norm": 2.480741024017334, + "learning_rate": 2.994143484626647e-07, + "loss": 1.1022, + "mean_token_accuracy": 0.6818432807922363, + "num_tokens": 21195693.0, + "step": 819 + }, + { + "epoch": 0.09005051614320228, + "ewc_loss": 1.8849968910217285e-06, + "grad_norm": 2.8070075511932373, + "learning_rate": 2.9978038067349925e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.6913570761680603, + "num_tokens": 21217125.0, + "step": 820 + }, + { + "epoch": 0.09016033384581594, + "ewc_loss": 1.8849968910217285e-06, + "grad_norm": 2.5195422172546387, + "learning_rate": 3.001464128843338e-07, + "loss": 1.1658, + "mean_token_accuracy": 0.6589032411575317, + "num_tokens": 21243201.0, + "step": 821 + }, + { + "epoch": 0.0902701515484296, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 2.3472251892089844, + "learning_rate": 3.005124450951684e-07, + "loss": 1.0684, + "mean_token_accuracy": 0.6904187202453613, + "num_tokens": 21272069.0, + "step": 822 + }, + { + "epoch": 0.09037996925104327, + "ewc_loss": 1.8849968910217285e-06, + 
"grad_norm": 2.4693853855133057, + "learning_rate": 3.008784773060029e-07, + "loss": 1.1543, + "mean_token_accuracy": 0.6648796796798706, + "num_tokens": 21299739.0, + "step": 823 + }, + { + "epoch": 0.09048978695365693, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 3.0269815921783447, + "learning_rate": 3.0124450951683744e-07, + "loss": 1.0677, + "mean_token_accuracy": 0.6767041683197021, + "num_tokens": 21318516.0, + "step": 824 + }, + { + "epoch": 0.0905996046562706, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 2.8828041553497314, + "learning_rate": 3.0161054172767204e-07, + "loss": 1.0849, + "mean_token_accuracy": 0.6741046905517578, + "num_tokens": 21337893.0, + "step": 825 + }, + { + "epoch": 0.09070942235888425, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 2.4314301013946533, + "learning_rate": 3.0197657393850654e-07, + "loss": 1.2002, + "mean_token_accuracy": 0.6472905874252319, + "num_tokens": 21366728.0, + "step": 826 + }, + { + "epoch": 0.09081924006149791, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 2.317101240158081, + "learning_rate": 3.0234260614934114e-07, + "loss": 1.1909, + "mean_token_accuracy": 0.650955319404602, + "num_tokens": 21398742.0, + "step": 827 + }, + { + "epoch": 0.09092905776411157, + "ewc_loss": 1.8849968910217285e-06, + "grad_norm": 2.628338575363159, + "learning_rate": 3.027086383601757e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.7041237354278564, + "num_tokens": 21422256.0, + "step": 828 + }, + { + "epoch": 0.09103887546672523, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 2.5495967864990234, + "learning_rate": 3.0307467057101024e-07, + "loss": 1.1313, + "mean_token_accuracy": 0.6716092824935913, + "num_tokens": 21445882.0, + "step": 829 + }, + { + "epoch": 0.09114869316933889, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 2.4980947971343994, + "learning_rate": 3.034407027818448e-07, + "loss": 1.1085, + "mean_token_accuracy": 0.6733561158180237, + "num_tokens": 21472863.0, + 
"step": 830 + }, + { + "epoch": 0.09125851087195255, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 2.824666738510132, + "learning_rate": 3.0380673499267934e-07, + "loss": 1.0454, + "mean_token_accuracy": 0.6914240717887878, + "num_tokens": 21496899.0, + "step": 831 + }, + { + "epoch": 0.09136832857456623, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 2.7989187240600586, + "learning_rate": 3.041727672035139e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.674737811088562, + "num_tokens": 21519107.0, + "step": 832 + }, + { + "epoch": 0.09147814627717989, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 2.6344990730285645, + "learning_rate": 3.0453879941434844e-07, + "loss": 1.1554, + "mean_token_accuracy": 0.6645640730857849, + "num_tokens": 21544059.0, + "step": 833 + }, + { + "epoch": 0.09158796397979355, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 2.65219783782959, + "learning_rate": 3.04904831625183e-07, + "loss": 1.079, + "mean_token_accuracy": 0.6875962018966675, + "num_tokens": 21568468.0, + "step": 834 + }, + { + "epoch": 0.0916977816824072, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.5776731967926025, + "learning_rate": 3.052708638360176e-07, + "loss": 1.0966, + "mean_token_accuracy": 0.673718273639679, + "num_tokens": 21592891.0, + "step": 835 + }, + { + "epoch": 0.09180759938502087, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.8914341926574707, + "learning_rate": 3.056368960468521e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6829721927642822, + "num_tokens": 21614569.0, + "step": 836 + }, + { + "epoch": 0.09191741708763453, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.658931255340576, + "learning_rate": 3.0600292825768663e-07, + "loss": 1.1562, + "mean_token_accuracy": 0.6574735045433044, + "num_tokens": 21638089.0, + "step": 837 + }, + { + "epoch": 0.09202723479024819, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.165301561355591, + "learning_rate": 3.0636896046852124e-07, + "loss": 
1.1014, + "mean_token_accuracy": 0.6873199939727783, + "num_tokens": 21668891.0, + "step": 838 + }, + { + "epoch": 0.09213705249286185, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.467811346054077, + "learning_rate": 3.067349926793558e-07, + "loss": 1.1526, + "mean_token_accuracy": 0.6684216260910034, + "num_tokens": 21696713.0, + "step": 839 + }, + { + "epoch": 0.0922468701954755, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.425135850906372, + "learning_rate": 3.071010248901903e-07, + "loss": 1.0145, + "mean_token_accuracy": 0.7030385732650757, + "num_tokens": 21724588.0, + "step": 840 + }, + { + "epoch": 0.09235668789808917, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 3.0246078968048096, + "learning_rate": 3.074670571010249e-07, + "loss": 1.0773, + "mean_token_accuracy": 0.693432092666626, + "num_tokens": 21745178.0, + "step": 841 + }, + { + "epoch": 0.09246650560070284, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.5973739624023438, + "learning_rate": 3.0783308931185943e-07, + "loss": 1.0765, + "mean_token_accuracy": 0.6892521381378174, + "num_tokens": 21768870.0, + "step": 842 + }, + { + "epoch": 0.0925763233033165, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.4139463901519775, + "learning_rate": 3.08199121522694e-07, + "loss": 1.0623, + "mean_token_accuracy": 0.686601996421814, + "num_tokens": 21796552.0, + "step": 843 + }, + { + "epoch": 0.09268614100593016, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.5702412128448486, + "learning_rate": 3.0856515373352853e-07, + "loss": 1.1344, + "mean_token_accuracy": 0.6727425456047058, + "num_tokens": 21825989.0, + "step": 844 + }, + { + "epoch": 0.09279595870854382, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.67399263381958, + "learning_rate": 3.089311859443631e-07, + "loss": 1.0415, + "mean_token_accuracy": 0.6918174028396606, + "num_tokens": 21848075.0, + "step": 845 + }, + { + "epoch": 0.09290577641115748, + "ewc_loss": 1.9222497940063477e-06, + 
"grad_norm": 2.368196487426758, + "learning_rate": 3.092972181551977e-07, + "loss": 1.138, + "mean_token_accuracy": 0.6728766560554504, + "num_tokens": 21877666.0, + "step": 846 + }, + { + "epoch": 0.09301559411377114, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 2.3239784240722656, + "learning_rate": 3.096632503660322e-07, + "loss": 1.0872, + "mean_token_accuracy": 0.6768977642059326, + "num_tokens": 21904463.0, + "step": 847 + }, + { + "epoch": 0.0931254118163848, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.6087827682495117, + "learning_rate": 3.1002928257686673e-07, + "loss": 1.0628, + "mean_token_accuracy": 0.6818196177482605, + "num_tokens": 21927852.0, + "step": 848 + }, + { + "epoch": 0.09323522951899846, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.3992674350738525, + "learning_rate": 3.1039531478770133e-07, + "loss": 1.058, + "mean_token_accuracy": 0.6952270269393921, + "num_tokens": 21956317.0, + "step": 849 + }, + { + "epoch": 0.09334504722161212, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.5152902603149414, + "learning_rate": 3.107613469985358e-07, + "loss": 1.0587, + "mean_token_accuracy": 0.6835290193557739, + "num_tokens": 21981920.0, + "step": 850 + }, + { + "epoch": 0.09345486492422579, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.813537120819092, + "learning_rate": 3.1112737920937043e-07, + "loss": 1.1213, + "mean_token_accuracy": 0.6710417866706848, + "num_tokens": 22006163.0, + "step": 851 + }, + { + "epoch": 0.09356468262683945, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.633270025253296, + "learning_rate": 3.11493411420205e-07, + "loss": 1.0921, + "mean_token_accuracy": 0.6761773228645325, + "num_tokens": 22029906.0, + "step": 852 + }, + { + "epoch": 0.09367450032945311, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.5643222332000732, + "learning_rate": 3.1185944363103947e-07, + "loss": 1.1315, + "mean_token_accuracy": 0.6743578910827637, + "num_tokens": 22055611.0, + "step": 
853 + }, + { + "epoch": 0.09378431803206677, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.4296486377716064, + "learning_rate": 3.122254758418741e-07, + "loss": 1.0802, + "mean_token_accuracy": 0.6748353242874146, + "num_tokens": 22081614.0, + "step": 854 + }, + { + "epoch": 0.09389413573468043, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 2.641274929046631, + "learning_rate": 3.125915080527086e-07, + "loss": 1.0637, + "mean_token_accuracy": 0.6821235418319702, + "num_tokens": 22104956.0, + "step": 855 + }, + { + "epoch": 0.09400395343729409, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.2715747356414795, + "learning_rate": 3.1295754026354317e-07, + "loss": 1.1112, + "mean_token_accuracy": 0.6718306541442871, + "num_tokens": 22135119.0, + "step": 856 + }, + { + "epoch": 0.09411377113990775, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.7317681312561035, + "learning_rate": 3.133235724743777e-07, + "loss": 1.1262, + "mean_token_accuracy": 0.676184892654419, + "num_tokens": 22157625.0, + "step": 857 + }, + { + "epoch": 0.09422358884252141, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.285090684890747, + "learning_rate": 3.1368960468521227e-07, + "loss": 1.0997, + "mean_token_accuracy": 0.681556224822998, + "num_tokens": 22188178.0, + "step": 858 + }, + { + "epoch": 0.09433340654513507, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.381847858428955, + "learning_rate": 3.1405563689604687e-07, + "loss": 1.1022, + "mean_token_accuracy": 0.6737164258956909, + "num_tokens": 22213762.0, + "step": 859 + }, + { + "epoch": 0.09444322424774873, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.7211904525756836, + "learning_rate": 3.1442166910688137e-07, + "loss": 1.1093, + "mean_token_accuracy": 0.6764949560165405, + "num_tokens": 22235680.0, + "step": 860 + }, + { + "epoch": 0.0945530419503624, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.6787400245666504, + "learning_rate": 3.147877013177159e-07, + "loss": 1.0385, + 
"mean_token_accuracy": 0.6928540468215942, + "num_tokens": 22258638.0, + "step": 861 + }, + { + "epoch": 0.09466285965297606, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.570228099822998, + "learning_rate": 3.151537335285505e-07, + "loss": 1.0809, + "mean_token_accuracy": 0.682483971118927, + "num_tokens": 22282129.0, + "step": 862 + }, + { + "epoch": 0.09477267735558972, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.2248353958129883, + "learning_rate": 3.1551976573938507e-07, + "loss": 1.0012, + "mean_token_accuracy": 0.7024192214012146, + "num_tokens": 22312222.0, + "step": 863 + }, + { + "epoch": 0.09488249505820338, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.8050851821899414, + "learning_rate": 3.1588579795021956e-07, + "loss": 1.0952, + "mean_token_accuracy": 0.6752294898033142, + "num_tokens": 22335476.0, + "step": 864 + }, + { + "epoch": 0.09499231276081704, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.439523935317993, + "learning_rate": 3.1625183016105417e-07, + "loss": 1.1482, + "mean_token_accuracy": 0.6846098899841309, + "num_tokens": 22362628.0, + "step": 865 + }, + { + "epoch": 0.0951021304634307, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.6216201782226562, + "learning_rate": 3.166178623718887e-07, + "loss": 1.1491, + "mean_token_accuracy": 0.6690148115158081, + "num_tokens": 22388402.0, + "step": 866 + }, + { + "epoch": 0.09521194816604436, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 2.4852118492126465, + "learning_rate": 3.1698389458272327e-07, + "loss": 1.1287, + "mean_token_accuracy": 0.6765002608299255, + "num_tokens": 22417436.0, + "step": 867 + }, + { + "epoch": 0.09532176586865802, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 2.766521692276001, + "learning_rate": 3.173499267935578e-07, + "loss": 1.156, + "mean_token_accuracy": 0.662887454032898, + "num_tokens": 22440356.0, + "step": 868 + }, + { + "epoch": 0.09543158357127168, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 
2.8358185291290283, + "learning_rate": 3.1771595900439236e-07, + "loss": 1.1515, + "mean_token_accuracy": 0.6724884510040283, + "num_tokens": 22462445.0, + "step": 869 + }, + { + "epoch": 0.09554140127388536, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 2.2460947036743164, + "learning_rate": 3.1808199121522697e-07, + "loss": 1.1474, + "mean_token_accuracy": 0.6680416464805603, + "num_tokens": 22495714.0, + "step": 870 + }, + { + "epoch": 0.09565121897649902, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 2.486186981201172, + "learning_rate": 3.1844802342606146e-07, + "loss": 1.1321, + "mean_token_accuracy": 0.6684809327125549, + "num_tokens": 22520890.0, + "step": 871 + }, + { + "epoch": 0.09576103667911268, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 2.3009157180786133, + "learning_rate": 3.18814055636896e-07, + "loss": 1.1043, + "mean_token_accuracy": 0.6827301979064941, + "num_tokens": 22548125.0, + "step": 872 + }, + { + "epoch": 0.09587085438172634, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 2.5710928440093994, + "learning_rate": 3.191800878477306e-07, + "loss": 1.049, + "mean_token_accuracy": 0.6961120367050171, + "num_tokens": 22571203.0, + "step": 873 + }, + { + "epoch": 0.09598067208434, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 2.334310293197632, + "learning_rate": 3.195461200585651e-07, + "loss": 1.0737, + "mean_token_accuracy": 0.680179238319397, + "num_tokens": 22600078.0, + "step": 874 + }, + { + "epoch": 0.09609048978695366, + "ewc_loss": 1.9818544387817383e-06, + "grad_norm": 2.579071521759033, + "learning_rate": 3.199121522693997e-07, + "loss": 1.1148, + "mean_token_accuracy": 0.6809896230697632, + "num_tokens": 22626236.0, + "step": 875 + }, + { + "epoch": 0.09620030748956732, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 2.3251168727874756, + "learning_rate": 3.2027818448023426e-07, + "loss": 1.1023, + "mean_token_accuracy": 0.6732990145683289, + "num_tokens": 22653233.0, + "step": 876 + }, + { + 
"epoch": 0.09631012519218098, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 2.414048671722412, + "learning_rate": 3.2064421669106876e-07, + "loss": 1.1199, + "mean_token_accuracy": 0.6691671013832092, + "num_tokens": 22679272.0, + "step": 877 + }, + { + "epoch": 0.09641994289479464, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 2.5632729530334473, + "learning_rate": 3.2101024890190336e-07, + "loss": 1.0566, + "mean_token_accuracy": 0.6908649206161499, + "num_tokens": 22704003.0, + "step": 878 + }, + { + "epoch": 0.0965297605974083, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 2.4629604816436768, + "learning_rate": 3.213762811127379e-07, + "loss": 1.0433, + "mean_token_accuracy": 0.6934324502944946, + "num_tokens": 22729119.0, + "step": 879 + }, + { + "epoch": 0.09663957830002197, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 2.417001247406006, + "learning_rate": 3.2174231332357246e-07, + "loss": 1.1211, + "mean_token_accuracy": 0.6739567518234253, + "num_tokens": 22755678.0, + "step": 880 + }, + { + "epoch": 0.09674939600263563, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 2.140575408935547, + "learning_rate": 3.22108345534407e-07, + "loss": 1.0974, + "mean_token_accuracy": 0.6797490119934082, + "num_tokens": 22788600.0, + "step": 881 + }, + { + "epoch": 0.09685921370524929, + "ewc_loss": 2.041459083557129e-06, + "grad_norm": 2.369331121444702, + "learning_rate": 3.2247437774524155e-07, + "loss": 1.0195, + "mean_token_accuracy": 0.6969113349914551, + "num_tokens": 22814636.0, + "step": 882 + }, + { + "epoch": 0.09696903140786295, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.682939291000366, + "learning_rate": 3.2284040995607616e-07, + "loss": 1.1343, + "mean_token_accuracy": 0.6687453389167786, + "num_tokens": 22839289.0, + "step": 883 + }, + { + "epoch": 0.09707884911047661, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.5682239532470703, + "learning_rate": 3.2320644216691065e-07, + "loss": 1.0841, + 
"mean_token_accuracy": 0.6779346466064453, + "num_tokens": 22864607.0, + "step": 884 + }, + { + "epoch": 0.09718866681309027, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.590761184692383, + "learning_rate": 3.235724743777452e-07, + "loss": 1.1335, + "mean_token_accuracy": 0.6820442080497742, + "num_tokens": 22892406.0, + "step": 885 + }, + { + "epoch": 0.09729848451570393, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.6102824211120605, + "learning_rate": 3.239385065885798e-07, + "loss": 1.1078, + "mean_token_accuracy": 0.6722120642662048, + "num_tokens": 22917563.0, + "step": 886 + }, + { + "epoch": 0.09740830221831759, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.522371530532837, + "learning_rate": 3.2430453879941435e-07, + "loss": 1.0837, + "mean_token_accuracy": 0.6818591356277466, + "num_tokens": 22944690.0, + "step": 887 + }, + { + "epoch": 0.09751811992093125, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.5500922203063965, + "learning_rate": 3.2467057101024885e-07, + "loss": 1.0719, + "mean_token_accuracy": 0.6864503622055054, + "num_tokens": 22970435.0, + "step": 888 + }, + { + "epoch": 0.09762793762354492, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.4319536685943604, + "learning_rate": 3.2503660322108345e-07, + "loss": 1.0423, + "mean_token_accuracy": 0.6958664059638977, + "num_tokens": 22996469.0, + "step": 889 + }, + { + "epoch": 0.09773775532615858, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.745380401611328, + "learning_rate": 3.25402635431918e-07, + "loss": 1.1642, + "mean_token_accuracy": 0.6699190139770508, + "num_tokens": 23019491.0, + "step": 890 + }, + { + "epoch": 0.09784757302877224, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.3234591484069824, + "learning_rate": 3.2576866764275255e-07, + "loss": 1.0817, + "mean_token_accuracy": 0.6876431703567505, + "num_tokens": 23048481.0, + "step": 891 + }, + { + "epoch": 0.0979573907313859, + "ewc_loss": 2.0563602447509766e-06, + 
"grad_norm": 2.465989828109741, + "learning_rate": 3.261346998535871e-07, + "loss": 1.0834, + "mean_token_accuracy": 0.6813939213752747, + "num_tokens": 23074453.0, + "step": 892 + }, + { + "epoch": 0.09806720843399956, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.273247003555298, + "learning_rate": 3.2650073206442165e-07, + "loss": 0.9947, + "mean_token_accuracy": 0.70455002784729, + "num_tokens": 23103151.0, + "step": 893 + }, + { + "epoch": 0.09817702613661322, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.252732515335083, + "learning_rate": 3.2686676427525625e-07, + "loss": 1.1049, + "mean_token_accuracy": 0.6750545501708984, + "num_tokens": 23134289.0, + "step": 894 + }, + { + "epoch": 0.09828684383922688, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.6526029109954834, + "learning_rate": 3.2723279648609075e-07, + "loss": 1.0341, + "mean_token_accuracy": 0.7011794447898865, + "num_tokens": 23156101.0, + "step": 895 + }, + { + "epoch": 0.09839666154184054, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.580875873565674, + "learning_rate": 3.275988286969253e-07, + "loss": 1.0078, + "mean_token_accuracy": 0.7003054618835449, + "num_tokens": 23179178.0, + "step": 896 + }, + { + "epoch": 0.0985064792444542, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.7090437412261963, + "learning_rate": 3.279648609077599e-07, + "loss": 1.1084, + "mean_token_accuracy": 0.6774298548698425, + "num_tokens": 23200455.0, + "step": 897 + }, + { + "epoch": 0.09861629694706786, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.48396897315979, + "learning_rate": 3.283308931185944e-07, + "loss": 1.1014, + "mean_token_accuracy": 0.6928870677947998, + "num_tokens": 23226237.0, + "step": 898 + }, + { + "epoch": 0.09872611464968153, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.3635001182556152, + "learning_rate": 3.28696925329429e-07, + "loss": 1.088, + "mean_token_accuracy": 0.6863744258880615, + "num_tokens": 23251813.0, + "step": 899 
+ }, + { + "epoch": 0.0988359323522952, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.410832405090332, + "learning_rate": 3.2906295754026354e-07, + "loss": 1.1515, + "mean_token_accuracy": 0.657989501953125, + "num_tokens": 23278810.0, + "step": 900 + }, + { + "epoch": 0.09894575005490885, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.259413242340088, + "learning_rate": 3.2942898975109804e-07, + "loss": 1.1379, + "mean_token_accuracy": 0.6726586818695068, + "num_tokens": 23307162.0, + "step": 901 + }, + { + "epoch": 0.09905556775752251, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.4886209964752197, + "learning_rate": 3.2979502196193264e-07, + "loss": 1.1491, + "mean_token_accuracy": 0.6649184226989746, + "num_tokens": 23335308.0, + "step": 902 + }, + { + "epoch": 0.09916538546013617, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.542587995529175, + "learning_rate": 3.301610541727672e-07, + "loss": 1.0499, + "mean_token_accuracy": 0.6935635805130005, + "num_tokens": 23359068.0, + "step": 903 + }, + { + "epoch": 0.09927520316274983, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.5145351886749268, + "learning_rate": 3.3052708638360174e-07, + "loss": 1.0137, + "mean_token_accuracy": 0.7057326436042786, + "num_tokens": 23381987.0, + "step": 904 + }, + { + "epoch": 0.0993850208653635, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.055111885070801, + "learning_rate": 3.308931185944363e-07, + "loss": 1.1268, + "mean_token_accuracy": 0.6714497208595276, + "num_tokens": 23420367.0, + "step": 905 + }, + { + "epoch": 0.09949483856797715, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 2.7031731605529785, + "learning_rate": 3.3125915080527084e-07, + "loss": 1.0681, + "mean_token_accuracy": 0.6882054805755615, + "num_tokens": 23442587.0, + "step": 906 + }, + { + "epoch": 0.09960465627059081, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.4807615280151367, + "learning_rate": 3.3162518301610544e-07, + "loss": 1.1347, 
+ "mean_token_accuracy": 0.6729483008384705, + "num_tokens": 23469268.0, + "step": 907 + }, + { + "epoch": 0.09971447397320449, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.2430243492126465, + "learning_rate": 3.3199121522693994e-07, + "loss": 1.1958, + "mean_token_accuracy": 0.6466002464294434, + "num_tokens": 23500886.0, + "step": 908 + }, + { + "epoch": 0.09982429167581815, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.6835334300994873, + "learning_rate": 3.323572474377745e-07, + "loss": 1.0985, + "mean_token_accuracy": 0.6821000576019287, + "num_tokens": 23523429.0, + "step": 909 + }, + { + "epoch": 0.0999341093784318, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.569913864135742, + "learning_rate": 3.327232796486091e-07, + "loss": 1.047, + "mean_token_accuracy": 0.6974800825119019, + "num_tokens": 23545940.0, + "step": 910 + }, + { + "epoch": 0.10004392708104547, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.5626087188720703, + "learning_rate": 3.3308931185944364e-07, + "loss": 1.1915, + "mean_token_accuracy": 0.6597558259963989, + "num_tokens": 23572000.0, + "step": 911 + }, + { + "epoch": 0.10015374478365913, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.3784804344177246, + "learning_rate": 3.3345534407027813e-07, + "loss": 1.1147, + "mean_token_accuracy": 0.6782117486000061, + "num_tokens": 23600798.0, + "step": 912 + }, + { + "epoch": 0.10026356248627279, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.495741128921509, + "learning_rate": 3.3382137628111273e-07, + "loss": 1.0295, + "mean_token_accuracy": 0.6953783631324768, + "num_tokens": 23624264.0, + "step": 913 + }, + { + "epoch": 0.10037338018888645, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.234027862548828, + "learning_rate": 3.341874084919473e-07, + "loss": 1.0688, + "mean_token_accuracy": 0.6854596138000488, + "num_tokens": 23655171.0, + "step": 914 + }, + { + "epoch": 0.1004831978915001, + "ewc_loss": 2.086162567138672e-06, + 
"grad_norm": 2.321928024291992, + "learning_rate": 3.3455344070278183e-07, + "loss": 1.1293, + "mean_token_accuracy": 0.6713092923164368, + "num_tokens": 23683903.0, + "step": 915 + }, + { + "epoch": 0.10059301559411377, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.6169445514678955, + "learning_rate": 3.349194729136164e-07, + "loss": 1.0639, + "mean_token_accuracy": 0.6916524171829224, + "num_tokens": 23706406.0, + "step": 916 + }, + { + "epoch": 0.10070283329672743, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.5459578037261963, + "learning_rate": 3.3528550512445093e-07, + "loss": 1.1325, + "mean_token_accuracy": 0.6658213138580322, + "num_tokens": 23732433.0, + "step": 917 + }, + { + "epoch": 0.1008126509993411, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.249345064163208, + "learning_rate": 3.356515373352855e-07, + "loss": 1.0373, + "mean_token_accuracy": 0.6918861865997314, + "num_tokens": 23759236.0, + "step": 918 + }, + { + "epoch": 0.10092246870195476, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.4756367206573486, + "learning_rate": 3.3601756954612003e-07, + "loss": 1.0276, + "mean_token_accuracy": 0.6971300840377808, + "num_tokens": 23784398.0, + "step": 919 + }, + { + "epoch": 0.10103228640456842, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.373013734817505, + "learning_rate": 3.363836017569546e-07, + "loss": 1.0107, + "mean_token_accuracy": 0.6990135908126831, + "num_tokens": 23812650.0, + "step": 920 + }, + { + "epoch": 0.10114210410718208, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.2634544372558594, + "learning_rate": 3.367496339677892e-07, + "loss": 1.0985, + "mean_token_accuracy": 0.6808940172195435, + "num_tokens": 23841776.0, + "step": 921 + }, + { + "epoch": 0.10125192180979574, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.5683326721191406, + "learning_rate": 3.371156661786237e-07, + "loss": 1.0844, + "mean_token_accuracy": 0.6778262853622437, + "num_tokens": 23865229.0, + "step": 922 
+ }, + { + "epoch": 0.1013617395124094, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.4931459426879883, + "learning_rate": 3.374816983894583e-07, + "loss": 1.0883, + "mean_token_accuracy": 0.6744000911712646, + "num_tokens": 23889843.0, + "step": 923 + }, + { + "epoch": 0.10147155721502306, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 2.496544361114502, + "learning_rate": 3.3784773060029283e-07, + "loss": 1.1664, + "mean_token_accuracy": 0.6652783155441284, + "num_tokens": 23917810.0, + "step": 924 + }, + { + "epoch": 0.10158137491763672, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 2.707955837249756, + "learning_rate": 3.382137628111273e-07, + "loss": 1.0891, + "mean_token_accuracy": 0.6788564324378967, + "num_tokens": 23940531.0, + "step": 925 + }, + { + "epoch": 0.10169119262025038, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 2.3727593421936035, + "learning_rate": 3.385797950219619e-07, + "loss": 1.0798, + "mean_token_accuracy": 0.6816244721412659, + "num_tokens": 23968313.0, + "step": 926 + }, + { + "epoch": 0.10180101032286405, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 2.3189220428466797, + "learning_rate": 3.389458272327965e-07, + "loss": 1.1237, + "mean_token_accuracy": 0.6752616167068481, + "num_tokens": 23999841.0, + "step": 927 + }, + { + "epoch": 0.10191082802547771, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 2.180107593536377, + "learning_rate": 3.39311859443631e-07, + "loss": 1.1448, + "mean_token_accuracy": 0.6633089780807495, + "num_tokens": 24031207.0, + "step": 928 + }, + { + "epoch": 0.10202064572809137, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 2.2042453289031982, + "learning_rate": 3.3967789165446557e-07, + "loss": 1.1307, + "mean_token_accuracy": 0.6719763875007629, + "num_tokens": 24060419.0, + "step": 929 + }, + { + "epoch": 0.10213046343070503, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.5369081497192383, + "learning_rate": 3.400439238653001e-07, + "loss": 1.0976, + 
"mean_token_accuracy": 0.6856615543365479, + "num_tokens": 24086752.0, + "step": 930 + }, + { + "epoch": 0.10224028113331869, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.542982816696167, + "learning_rate": 3.404099560761347e-07, + "loss": 1.0463, + "mean_token_accuracy": 0.6866042613983154, + "num_tokens": 24110671.0, + "step": 931 + }, + { + "epoch": 0.10235009883593235, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.2635726928710938, + "learning_rate": 3.407759882869692e-07, + "loss": 1.0908, + "mean_token_accuracy": 0.6783899068832397, + "num_tokens": 24142021.0, + "step": 932 + }, + { + "epoch": 0.10245991653854601, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.7602031230926514, + "learning_rate": 3.4114202049780377e-07, + "loss": 1.095, + "mean_token_accuracy": 0.6788630485534668, + "num_tokens": 24163153.0, + "step": 933 + }, + { + "epoch": 0.10256973424115967, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.2486181259155273, + "learning_rate": 3.4150805270863837e-07, + "loss": 1.1585, + "mean_token_accuracy": 0.6620272994041443, + "num_tokens": 24194395.0, + "step": 934 + }, + { + "epoch": 0.10267955194377333, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.4315202236175537, + "learning_rate": 3.4187408491947287e-07, + "loss": 1.1428, + "mean_token_accuracy": 0.6651331186294556, + "num_tokens": 24220717.0, + "step": 935 + }, + { + "epoch": 0.10278936964638699, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.4940319061279297, + "learning_rate": 3.422401171303074e-07, + "loss": 1.1038, + "mean_token_accuracy": 0.6816353797912598, + "num_tokens": 24245109.0, + "step": 936 + }, + { + "epoch": 0.10289918734900066, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.9122884273529053, + "learning_rate": 3.42606149341142e-07, + "loss": 1.1733, + "mean_token_accuracy": 0.6678253412246704, + "num_tokens": 24264983.0, + "step": 937 + }, + { + "epoch": 0.10300900505161432, + "ewc_loss": 2.1457672119140625e-06, + 
"grad_norm": 2.632361650466919, + "learning_rate": 3.4297218155197657e-07, + "loss": 1.0961, + "mean_token_accuracy": 0.6879681348800659, + "num_tokens": 24288101.0, + "step": 938 + }, + { + "epoch": 0.10311882275422798, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.3344151973724365, + "learning_rate": 3.433382137628111e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6854954361915588, + "num_tokens": 24314589.0, + "step": 939 + }, + { + "epoch": 0.10322864045684164, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.627803087234497, + "learning_rate": 3.4370424597364567e-07, + "loss": 1.1478, + "mean_token_accuracy": 0.6754981279373169, + "num_tokens": 24337109.0, + "step": 940 + }, + { + "epoch": 0.1033384581594553, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.2846195697784424, + "learning_rate": 3.440702781844802e-07, + "loss": 1.1341, + "mean_token_accuracy": 0.6736787557601929, + "num_tokens": 24367221.0, + "step": 941 + }, + { + "epoch": 0.10344827586206896, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.406125068664551, + "learning_rate": 3.4443631039531476e-07, + "loss": 1.1535, + "mean_token_accuracy": 0.6645626425743103, + "num_tokens": 24394210.0, + "step": 942 + }, + { + "epoch": 0.10355809356468262, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.415144681930542, + "learning_rate": 3.448023426061493e-07, + "loss": 1.0194, + "mean_token_accuracy": 0.6961393356323242, + "num_tokens": 24419005.0, + "step": 943 + }, + { + "epoch": 0.10366791126729628, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.3318700790405273, + "learning_rate": 3.4516837481698386e-07, + "loss": 1.1025, + "mean_token_accuracy": 0.6804639101028442, + "num_tokens": 24448427.0, + "step": 944 + }, + { + "epoch": 0.10377772896990994, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.188319683074951, + "learning_rate": 3.4553440702781846e-07, + "loss": 1.2066, + "mean_token_accuracy": 0.6511316895484924, + "num_tokens": 24480209.0, + 
"step": 945 + }, + { + "epoch": 0.10388754667252362, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.5985207557678223, + "learning_rate": 3.4590043923865296e-07, + "loss": 1.0323, + "mean_token_accuracy": 0.696938157081604, + "num_tokens": 24503484.0, + "step": 946 + }, + { + "epoch": 0.10399736437513728, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.4451351165771484, + "learning_rate": 3.4626647144948756e-07, + "loss": 1.1064, + "mean_token_accuracy": 0.6732171773910522, + "num_tokens": 24528827.0, + "step": 947 + }, + { + "epoch": 0.10410718207775094, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.7653863430023193, + "learning_rate": 3.466325036603221e-07, + "loss": 1.0393, + "mean_token_accuracy": 0.6978664398193359, + "num_tokens": 24550278.0, + "step": 948 + }, + { + "epoch": 0.1042169997803646, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.5246646404266357, + "learning_rate": 3.469985358711566e-07, + "loss": 1.0122, + "mean_token_accuracy": 0.6961222887039185, + "num_tokens": 24573147.0, + "step": 949 + }, + { + "epoch": 0.10432681748297826, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.5420920848846436, + "learning_rate": 3.473645680819912e-07, + "loss": 1.0788, + "mean_token_accuracy": 0.6764886975288391, + "num_tokens": 24597279.0, + "step": 950 + }, + { + "epoch": 0.10443663518559192, + "ewc_loss": 2.1457672119140625e-06, + "grad_norm": 2.4793407917022705, + "learning_rate": 3.4773060029282576e-07, + "loss": 1.1283, + "mean_token_accuracy": 0.6610246896743774, + "num_tokens": 24623328.0, + "step": 951 + }, + { + "epoch": 0.10454645288820558, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.7681188583374023, + "learning_rate": 3.480966325036603e-07, + "loss": 1.0763, + "mean_token_accuracy": 0.6938923597335815, + "num_tokens": 24644301.0, + "step": 952 + }, + { + "epoch": 0.10465627059081924, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.094344139099121, + "learning_rate": 3.4846266471449486e-07, + "loss": 
1.1397, + "mean_token_accuracy": 0.6696410179138184, + "num_tokens": 24678615.0, + "step": 953 + }, + { + "epoch": 0.1047660882934329, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.690368890762329, + "learning_rate": 3.488286969253294e-07, + "loss": 0.9795, + "mean_token_accuracy": 0.7058759927749634, + "num_tokens": 24701336.0, + "step": 954 + }, + { + "epoch": 0.10487590599604656, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.3835017681121826, + "learning_rate": 3.49194729136164e-07, + "loss": 1.0379, + "mean_token_accuracy": 0.6943291425704956, + "num_tokens": 24727917.0, + "step": 955 + }, + { + "epoch": 0.10498572369866023, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.7924182415008545, + "learning_rate": 3.495607613469985e-07, + "loss": 1.0319, + "mean_token_accuracy": 0.6890584826469421, + "num_tokens": 24748147.0, + "step": 956 + }, + { + "epoch": 0.10509554140127389, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.5382015705108643, + "learning_rate": 3.4992679355783305e-07, + "loss": 1.0715, + "mean_token_accuracy": 0.6890314221382141, + "num_tokens": 24773384.0, + "step": 957 + }, + { + "epoch": 0.10520535910388755, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 2.3635025024414062, + "learning_rate": 3.5029282576866766e-07, + "loss": 1.1059, + "mean_token_accuracy": 0.6723852157592773, + "num_tokens": 24801521.0, + "step": 958 + }, + { + "epoch": 0.10531517680650121, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 2.4671573638916016, + "learning_rate": 3.5065885797950215e-07, + "loss": 1.1273, + "mean_token_accuracy": 0.6745220422744751, + "num_tokens": 24830015.0, + "step": 959 + }, + { + "epoch": 0.10542499450911487, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 2.6201913356781006, + "learning_rate": 3.510248901903367e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6778579950332642, + "num_tokens": 24853232.0, + "step": 960 + }, + { + "epoch": 0.10553481221172853, + "ewc_loss": 2.1904706954956055e-06, + 
"grad_norm": 2.520585536956787, + "learning_rate": 3.513909224011713e-07, + "loss": 1.1485, + "mean_token_accuracy": 0.667584240436554, + "num_tokens": 24877653.0, + "step": 961 + }, + { + "epoch": 0.10564462991434219, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 2.5266194343566895, + "learning_rate": 3.5175695461200585e-07, + "loss": 1.1297, + "mean_token_accuracy": 0.669834315776825, + "num_tokens": 24903434.0, + "step": 962 + }, + { + "epoch": 0.10575444761695585, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 2.4492712020874023, + "learning_rate": 3.521229868228404e-07, + "loss": 1.1646, + "mean_token_accuracy": 0.668046236038208, + "num_tokens": 24931347.0, + "step": 963 + }, + { + "epoch": 0.10586426531956951, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.6415510177612305, + "learning_rate": 3.5248901903367495e-07, + "loss": 1.0651, + "mean_token_accuracy": 0.6885560750961304, + "num_tokens": 24955365.0, + "step": 964 + }, + { + "epoch": 0.10597408302218318, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.732048511505127, + "learning_rate": 3.528550512445095e-07, + "loss": 1.1535, + "mean_token_accuracy": 0.6627172827720642, + "num_tokens": 24977249.0, + "step": 965 + }, + { + "epoch": 0.10608390072479684, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 2.284897804260254, + "learning_rate": 3.5322108345534405e-07, + "loss": 1.1641, + "mean_token_accuracy": 0.665733814239502, + "num_tokens": 25008578.0, + "step": 966 + }, + { + "epoch": 0.1061937184274105, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.7653565406799316, + "learning_rate": 3.535871156661786e-07, + "loss": 1.0264, + "mean_token_accuracy": 0.6894586086273193, + "num_tokens": 25028258.0, + "step": 967 + }, + { + "epoch": 0.10630353613002416, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.5614895820617676, + "learning_rate": 3.5395314787701315e-07, + "loss": 1.0556, + "mean_token_accuracy": 0.6884920001029968, + "num_tokens": 25052108.0, + "step": 968 
+ }, + { + "epoch": 0.10641335383263782, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 2.0980138778686523, + "learning_rate": 3.5431918008784775e-07, + "loss": 1.1169, + "mean_token_accuracy": 0.6777213215827942, + "num_tokens": 25085379.0, + "step": 969 + }, + { + "epoch": 0.10652317153525148, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.637098789215088, + "learning_rate": 3.5468521229868224e-07, + "loss": 1.0949, + "mean_token_accuracy": 0.6809373497962952, + "num_tokens": 25108319.0, + "step": 970 + }, + { + "epoch": 0.10663298923786514, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.3091275691986084, + "learning_rate": 3.5505124450951685e-07, + "loss": 1.2022, + "mean_token_accuracy": 0.6577115058898926, + "num_tokens": 25139744.0, + "step": 971 + }, + { + "epoch": 0.1067428069404788, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.4136815071105957, + "learning_rate": 3.554172767203514e-07, + "loss": 1.085, + "mean_token_accuracy": 0.6814132928848267, + "num_tokens": 25166647.0, + "step": 972 + }, + { + "epoch": 0.10685262464309246, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.2907907962799072, + "learning_rate": 3.557833089311859e-07, + "loss": 1.0698, + "mean_token_accuracy": 0.6879516839981079, + "num_tokens": 25195503.0, + "step": 973 + }, + { + "epoch": 0.10696244234570612, + "ewc_loss": 2.205371856689453e-06, + "grad_norm": 2.467869758605957, + "learning_rate": 3.561493411420205e-07, + "loss": 1.0532, + "mean_token_accuracy": 0.6909594535827637, + "num_tokens": 25220138.0, + "step": 974 + }, + { + "epoch": 0.1070722600483198, + "ewc_loss": 2.2351741790771484e-06, + "grad_norm": 2.482585906982422, + "learning_rate": 3.5651537335285504e-07, + "loss": 1.0645, + "mean_token_accuracy": 0.6834393739700317, + "num_tokens": 25244278.0, + "step": 975 + }, + { + "epoch": 0.10718207775093345, + "ewc_loss": 2.2351741790771484e-06, + "grad_norm": 2.338942766189575, + "learning_rate": 3.5688140556368954e-07, + "loss": 1.1033, + 
"mean_token_accuracy": 0.6705430746078491, + "num_tokens": 25275205.0, + "step": 976 + }, + { + "epoch": 0.10729189545354711, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 2.658623218536377, + "learning_rate": 3.5724743777452414e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6919629573822021, + "num_tokens": 25296551.0, + "step": 977 + }, + { + "epoch": 0.10740171315616077, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 2.5565080642700195, + "learning_rate": 3.576134699853587e-07, + "loss": 0.9492, + "mean_token_accuracy": 0.7191322445869446, + "num_tokens": 25319702.0, + "step": 978 + }, + { + "epoch": 0.10751153085877443, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 2.458294153213501, + "learning_rate": 3.579795021961933e-07, + "loss": 1.1393, + "mean_token_accuracy": 0.667262613773346, + "num_tokens": 25345390.0, + "step": 979 + }, + { + "epoch": 0.1076213485613881, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 2.4055702686309814, + "learning_rate": 3.583455344070278e-07, + "loss": 1.0645, + "mean_token_accuracy": 0.6855416297912598, + "num_tokens": 25371403.0, + "step": 980 + }, + { + "epoch": 0.10773116626400175, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 2.7783565521240234, + "learning_rate": 3.5871156661786234e-07, + "loss": 1.0164, + "mean_token_accuracy": 0.6979764699935913, + "num_tokens": 25389771.0, + "step": 981 + }, + { + "epoch": 0.10784098396661541, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 2.684598207473755, + "learning_rate": 3.5907759882869694e-07, + "loss": 1.0525, + "mean_token_accuracy": 0.6862291097640991, + "num_tokens": 25410123.0, + "step": 982 + }, + { + "epoch": 0.10795080166922907, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.6334214210510254, + "learning_rate": 3.5944363103953144e-07, + "loss": 1.0627, + "mean_token_accuracy": 0.6823060512542725, + "num_tokens": 25432542.0, + "step": 983 + }, + { + "epoch": 0.10806061937184275, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 
2.395592212677002, + "learning_rate": 3.59809663250366e-07, + "loss": 1.1117, + "mean_token_accuracy": 0.6712405681610107, + "num_tokens": 25458481.0, + "step": 984 + }, + { + "epoch": 0.10817043707445641, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.496279001235962, + "learning_rate": 3.601756954612006e-07, + "loss": 1.0308, + "mean_token_accuracy": 0.7001397609710693, + "num_tokens": 25483261.0, + "step": 985 + }, + { + "epoch": 0.10828025477707007, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.471064805984497, + "learning_rate": 3.6054172767203514e-07, + "loss": 1.0324, + "mean_token_accuracy": 0.6978334188461304, + "num_tokens": 25511391.0, + "step": 986 + }, + { + "epoch": 0.10839007247968373, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.1995792388916016, + "learning_rate": 3.609077598828697e-07, + "loss": 1.1084, + "mean_token_accuracy": 0.6756153106689453, + "num_tokens": 25541179.0, + "step": 987 + }, + { + "epoch": 0.10849989018229739, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.6318538188934326, + "learning_rate": 3.6127379209370423e-07, + "loss": 1.0187, + "mean_token_accuracy": 0.7004084587097168, + "num_tokens": 25564057.0, + "step": 988 + }, + { + "epoch": 0.10860970788491105, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.407278060913086, + "learning_rate": 3.616398243045388e-07, + "loss": 1.0562, + "mean_token_accuracy": 0.6946596503257751, + "num_tokens": 25590617.0, + "step": 989 + }, + { + "epoch": 0.1087195255875247, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.8950414657592773, + "learning_rate": 3.6200585651537333e-07, + "loss": 1.0614, + "mean_token_accuracy": 0.6892654895782471, + "num_tokens": 25610733.0, + "step": 990 + }, + { + "epoch": 0.10882934329013837, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.5494415760040283, + "learning_rate": 3.623718887262079e-07, + "loss": 1.0908, + "mean_token_accuracy": 0.679350733757019, + "num_tokens": 25636827.0, + "step": 991 + }, + { + 
"epoch": 0.10893916099275203, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.5831615924835205, + "learning_rate": 3.6273792093704243e-07, + "loss": 1.1237, + "mean_token_accuracy": 0.6758301258087158, + "num_tokens": 25659218.0, + "step": 992 + }, + { + "epoch": 0.10904897869536569, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.638615131378174, + "learning_rate": 3.6310395314787703e-07, + "loss": 1.0769, + "mean_token_accuracy": 0.6830940842628479, + "num_tokens": 25682816.0, + "step": 993 + }, + { + "epoch": 0.10915879639797936, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.3016958236694336, + "learning_rate": 3.6346998535871153e-07, + "loss": 1.0977, + "mean_token_accuracy": 0.6786888837814331, + "num_tokens": 25712238.0, + "step": 994 + }, + { + "epoch": 0.10926861410059302, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 2.4645965099334717, + "learning_rate": 3.6383601756954613e-07, + "loss": 0.9912, + "mean_token_accuracy": 0.6973376274108887, + "num_tokens": 25737964.0, + "step": 995 + }, + { + "epoch": 0.10937843180320668, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.305593252182007, + "learning_rate": 3.642020497803807e-07, + "loss": 1.0174, + "mean_token_accuracy": 0.6976227760314941, + "num_tokens": 25765993.0, + "step": 996 + }, + { + "epoch": 0.10948824950582034, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.672255277633667, + "learning_rate": 3.645680819912152e-07, + "loss": 1.0617, + "mean_token_accuracy": 0.6894952058792114, + "num_tokens": 25789007.0, + "step": 997 + }, + { + "epoch": 0.109598067208434, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.4672865867614746, + "learning_rate": 3.649341142020498e-07, + "loss": 0.9923, + "mean_token_accuracy": 0.7059544324874878, + "num_tokens": 25814004.0, + "step": 998 + }, + { + "epoch": 0.10970788491104766, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.682279109954834, + "learning_rate": 3.653001464128843e-07, + "loss": 1.0667, + 
"mean_token_accuracy": 0.6873344779014587, + "num_tokens": 25835515.0, + "step": 999 + }, + { + "epoch": 0.10981770261366132, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.5581672191619873, + "learning_rate": 3.656661786237188e-07, + "loss": 1.1071, + "mean_token_accuracy": 0.6813325881958008, + "num_tokens": 25859300.0, + "step": 1000 + }, + { + "epoch": 0.10992752031627498, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.3424324989318848, + "learning_rate": 3.660322108345534e-07, + "loss": 1.0448, + "mean_token_accuracy": 0.6899235248565674, + "num_tokens": 25888053.0, + "step": 1001 + }, + { + "epoch": 0.11003733801888864, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.4469988346099854, + "learning_rate": 3.66398243045388e-07, + "loss": 1.1599, + "mean_token_accuracy": 0.6607282161712646, + "num_tokens": 25913789.0, + "step": 1002 + }, + { + "epoch": 0.11014715572150231, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.633223533630371, + "learning_rate": 3.667642752562226e-07, + "loss": 1.1697, + "mean_token_accuracy": 0.669622540473938, + "num_tokens": 25936587.0, + "step": 1003 + }, + { + "epoch": 0.11025697342411597, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 2.2316203117370605, + "learning_rate": 3.6713030746705707e-07, + "loss": 1.0542, + "mean_token_accuracy": 0.6887749433517456, + "num_tokens": 25965940.0, + "step": 1004 + }, + { + "epoch": 0.11036679112672963, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 2.294215679168701, + "learning_rate": 3.674963396778916e-07, + "loss": 0.9952, + "mean_token_accuracy": 0.7068980932235718, + "num_tokens": 25993188.0, + "step": 1005 + }, + { + "epoch": 0.11047660882934329, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 2.4806015491485596, + "learning_rate": 3.678623718887262e-07, + "loss": 1.0329, + "mean_token_accuracy": 0.6922914981842041, + "num_tokens": 26016160.0, + "step": 1006 + }, + { + "epoch": 0.11058642653195695, + "ewc_loss": 2.3245811462402344e-06, + 
"grad_norm": 2.548187017440796, + "learning_rate": 3.682284040995607e-07, + "loss": 1.0427, + "mean_token_accuracy": 0.6893337965011597, + "num_tokens": 26039892.0, + "step": 1007 + }, + { + "epoch": 0.11069624423457061, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 2.631969690322876, + "learning_rate": 3.6859443631039527e-07, + "loss": 1.0742, + "mean_token_accuracy": 0.6811760663986206, + "num_tokens": 26061680.0, + "step": 1008 + }, + { + "epoch": 0.11080606193718427, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 2.428661584854126, + "learning_rate": 3.6896046852122987e-07, + "loss": 1.1691, + "mean_token_accuracy": 0.6573650240898132, + "num_tokens": 26088811.0, + "step": 1009 + }, + { + "epoch": 0.11091587963979793, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 2.561483144760132, + "learning_rate": 3.693265007320644e-07, + "loss": 1.1298, + "mean_token_accuracy": 0.670204758644104, + "num_tokens": 26113462.0, + "step": 1010 + }, + { + "epoch": 0.11102569734241159, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 2.3862264156341553, + "learning_rate": 3.6969253294289897e-07, + "loss": 1.1565, + "mean_token_accuracy": 0.6630048155784607, + "num_tokens": 26140867.0, + "step": 1011 + }, + { + "epoch": 0.11113551504502525, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 2.340144634246826, + "learning_rate": 3.700585651537335e-07, + "loss": 1.0762, + "mean_token_accuracy": 0.6779847145080566, + "num_tokens": 26168121.0, + "step": 1012 + }, + { + "epoch": 0.11124533274763893, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 2.73001766204834, + "learning_rate": 3.7042459736456807e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6905218958854675, + "num_tokens": 26189626.0, + "step": 1013 + }, + { + "epoch": 0.11135515045025259, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 2.634979248046875, + "learning_rate": 3.707906295754026e-07, + "loss": 1.1049, + "mean_token_accuracy": 0.6759074330329895, + "num_tokens": 26213658.0, + 
"step": 1014 + }, + { + "epoch": 0.11146496815286625, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 2.198227882385254, + "learning_rate": 3.7115666178623716e-07, + "loss": 1.0445, + "mean_token_accuracy": 0.6895456314086914, + "num_tokens": 26243731.0, + "step": 1015 + }, + { + "epoch": 0.1115747858554799, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 2.730806589126587, + "learning_rate": 3.715226939970717e-07, + "loss": 1.0673, + "mean_token_accuracy": 0.6829624176025391, + "num_tokens": 26266656.0, + "step": 1016 + }, + { + "epoch": 0.11168460355809356, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 3.004818916320801, + "learning_rate": 3.7188872620790626e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6929221749305725, + "num_tokens": 26285162.0, + "step": 1017 + }, + { + "epoch": 0.11179442126070722, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.541492462158203, + "learning_rate": 3.722547584187408e-07, + "loss": 1.0477, + "mean_token_accuracy": 0.6899318099021912, + "num_tokens": 26309229.0, + "step": 1018 + }, + { + "epoch": 0.11190423896332088, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.5162110328674316, + "learning_rate": 3.726207906295754e-07, + "loss": 1.1083, + "mean_token_accuracy": 0.6787024736404419, + "num_tokens": 26334702.0, + "step": 1019 + }, + { + "epoch": 0.11201405666593454, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.6300649642944336, + "learning_rate": 3.7298682284040996e-07, + "loss": 1.0691, + "mean_token_accuracy": 0.6941543817520142, + "num_tokens": 26359954.0, + "step": 1020 + }, + { + "epoch": 0.1121238743685482, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.5010948181152344, + "learning_rate": 3.7335285505124446e-07, + "loss": 1.026, + "mean_token_accuracy": 0.6963381767272949, + "num_tokens": 26383642.0, + "step": 1021 + }, + { + "epoch": 0.11223369207116188, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.264033317565918, + "learning_rate": 3.7371888726207906e-07, 
+ "loss": 1.0387, + "mean_token_accuracy": 0.6980214715003967, + "num_tokens": 26413718.0, + "step": 1022 + }, + { + "epoch": 0.11234350977377554, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.1124370098114014, + "learning_rate": 3.740849194729136e-07, + "loss": 1.1543, + "mean_token_accuracy": 0.6656838655471802, + "num_tokens": 26448070.0, + "step": 1023 + }, + { + "epoch": 0.1124533274763892, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.3240604400634766, + "learning_rate": 3.744509516837481e-07, + "loss": 1.1619, + "mean_token_accuracy": 0.6629778146743774, + "num_tokens": 26477876.0, + "step": 1024 + }, + { + "epoch": 0.11256314517900286, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 2.570847511291504, + "learning_rate": 3.748169838945827e-07, + "loss": 1.1714, + "mean_token_accuracy": 0.654467761516571, + "num_tokens": 26503463.0, + "step": 1025 + }, + { + "epoch": 0.11267296288161652, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.3962535858154297, + "learning_rate": 3.7518301610541726e-07, + "loss": 1.0691, + "mean_token_accuracy": 0.6864923238754272, + "num_tokens": 26530910.0, + "step": 1026 + }, + { + "epoch": 0.11278278058423018, + "ewc_loss": 2.384185791015625e-06, + "grad_norm": 2.4245047569274902, + "learning_rate": 3.7554904831625186e-07, + "loss": 1.1334, + "mean_token_accuracy": 0.6651598215103149, + "num_tokens": 26556511.0, + "step": 1027 + }, + { + "epoch": 0.11289259828684384, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.8309569358825684, + "learning_rate": 3.7591508052708636e-07, + "loss": 1.0913, + "mean_token_accuracy": 0.6746019124984741, + "num_tokens": 26577702.0, + "step": 1028 + }, + { + "epoch": 0.1130024159894575, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.53920316696167, + "learning_rate": 3.762811127379209e-07, + "loss": 1.1619, + "mean_token_accuracy": 0.6695722937583923, + "num_tokens": 26602465.0, + "step": 1029 + }, + { + "epoch": 0.11311223369207116, + "ewc_loss": 
2.3990869522094727e-06, + "grad_norm": 2.7751998901367188, + "learning_rate": 3.766471449487555e-07, + "loss": 1.0619, + "mean_token_accuracy": 0.683074951171875, + "num_tokens": 26622436.0, + "step": 1030 + }, + { + "epoch": 0.11322205139468482, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.469318389892578, + "learning_rate": 3.7701317715959e-07, + "loss": 1.0724, + "mean_token_accuracy": 0.6865867972373962, + "num_tokens": 26646386.0, + "step": 1031 + }, + { + "epoch": 0.11333186909729849, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.424290895462036, + "learning_rate": 3.7737920937042455e-07, + "loss": 1.1681, + "mean_token_accuracy": 0.6600143909454346, + "num_tokens": 26674495.0, + "step": 1032 + }, + { + "epoch": 0.11344168679991215, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.364516019821167, + "learning_rate": 3.7774524158125915e-07, + "loss": 1.0459, + "mean_token_accuracy": 0.6926836967468262, + "num_tokens": 26701395.0, + "step": 1033 + }, + { + "epoch": 0.11355150450252581, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.6259429454803467, + "learning_rate": 3.781112737920937e-07, + "loss": 1.1335, + "mean_token_accuracy": 0.6758246421813965, + "num_tokens": 26724537.0, + "step": 1034 + }, + { + "epoch": 0.11366132220513947, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.49991512298584, + "learning_rate": 3.7847730600292825e-07, + "loss": 1.1087, + "mean_token_accuracy": 0.6779454946517944, + "num_tokens": 26747889.0, + "step": 1035 + }, + { + "epoch": 0.11377113990775313, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 3.0139665603637695, + "learning_rate": 3.788433382137628e-07, + "loss": 1.0175, + "mean_token_accuracy": 0.6973746418952942, + "num_tokens": 26767707.0, + "step": 1036 + }, + { + "epoch": 0.11388095761036679, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.4135854244232178, + "learning_rate": 3.7920937042459735e-07, + "loss": 1.1343, + "mean_token_accuracy": 0.6812331080436707, + 
"num_tokens": 26793032.0, + "step": 1037 + }, + { + "epoch": 0.11399077531298045, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 2.4756247997283936, + "learning_rate": 3.795754026354319e-07, + "loss": 1.0446, + "mean_token_accuracy": 0.6893901228904724, + "num_tokens": 26819051.0, + "step": 1038 + }, + { + "epoch": 0.11410059301559411, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 2.5764920711517334, + "learning_rate": 3.7994143484626645e-07, + "loss": 1.0494, + "mean_token_accuracy": 0.6931675672531128, + "num_tokens": 26842616.0, + "step": 1039 + }, + { + "epoch": 0.11421041071820777, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 2.9338018894195557, + "learning_rate": 3.80307467057101e-07, + "loss": 1.1197, + "mean_token_accuracy": 0.6825668215751648, + "num_tokens": 26865703.0, + "step": 1040 + }, + { + "epoch": 0.11432022842082144, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 3.012545347213745, + "learning_rate": 3.8067349926793555e-07, + "loss": 1.1115, + "mean_token_accuracy": 0.668535053730011, + "num_tokens": 26884984.0, + "step": 1041 + }, + { + "epoch": 0.1144300461234351, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 2.4332799911499023, + "learning_rate": 3.810395314787701e-07, + "loss": 1.0882, + "mean_token_accuracy": 0.6836732625961304, + "num_tokens": 26912097.0, + "step": 1042 + }, + { + "epoch": 0.11453986382604876, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 2.7074482440948486, + "learning_rate": 3.814055636896047e-07, + "loss": 1.0229, + "mean_token_accuracy": 0.6993698477745056, + "num_tokens": 26932033.0, + "step": 1043 + }, + { + "epoch": 0.11464968152866242, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 2.572394847869873, + "learning_rate": 3.8177159590043925e-07, + "loss": 1.1068, + "mean_token_accuracy": 0.67563796043396, + "num_tokens": 26954418.0, + "step": 1044 + }, + { + "epoch": 0.11475949923127608, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 2.2795157432556152, + 
"learning_rate": 3.8213762811127374e-07, + "loss": 1.0716, + "mean_token_accuracy": 0.6854556202888489, + "num_tokens": 26984406.0, + "step": 1045 + }, + { + "epoch": 0.11486931693388974, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 2.4697818756103516, + "learning_rate": 3.8250366032210835e-07, + "loss": 1.0719, + "mean_token_accuracy": 0.688709557056427, + "num_tokens": 27012181.0, + "step": 1046 + }, + { + "epoch": 0.1149791346365034, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 2.2968904972076416, + "learning_rate": 3.828696925329429e-07, + "loss": 1.0966, + "mean_token_accuracy": 0.6781215667724609, + "num_tokens": 27040817.0, + "step": 1047 + }, + { + "epoch": 0.11508895233911706, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 2.5451340675354004, + "learning_rate": 3.832357247437774e-07, + "loss": 1.0088, + "mean_token_accuracy": 0.7110108733177185, + "num_tokens": 27063531.0, + "step": 1048 + }, + { + "epoch": 0.11519877004173072, + "ewc_loss": 2.428889274597168e-06, + "grad_norm": 2.58406925201416, + "learning_rate": 3.83601756954612e-07, + "loss": 1.0461, + "mean_token_accuracy": 0.6858173608779907, + "num_tokens": 27085193.0, + "step": 1049 + }, + { + "epoch": 0.11530858774434438, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 2.3968334197998047, + "learning_rate": 3.8396778916544654e-07, + "loss": 1.0901, + "mean_token_accuracy": 0.6747442483901978, + "num_tokens": 27112026.0, + "step": 1050 + }, + { + "epoch": 0.11541840544695806, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 2.66580867767334, + "learning_rate": 3.8433382137628114e-07, + "loss": 1.0859, + "mean_token_accuracy": 0.6852315068244934, + "num_tokens": 27134736.0, + "step": 1051 + }, + { + "epoch": 0.11552822314957172, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 2.2046382427215576, + "learning_rate": 3.8469985358711564e-07, + "loss": 1.1974, + "mean_token_accuracy": 0.6510428190231323, + "num_tokens": 27167154.0, + "step": 1052 + }, + { + "epoch": 
0.11563804085218538, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 2.4074361324310303, + "learning_rate": 3.850658857979502e-07, + "loss": 1.057, + "mean_token_accuracy": 0.6927018165588379, + "num_tokens": 27191779.0, + "step": 1053 + }, + { + "epoch": 0.11574785855479904, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 2.745878219604492, + "learning_rate": 3.854319180087848e-07, + "loss": 1.0739, + "mean_token_accuracy": 0.6802382469177246, + "num_tokens": 27214207.0, + "step": 1054 + }, + { + "epoch": 0.1158576762574127, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 2.4476499557495117, + "learning_rate": 3.857979502196193e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.6802989840507507, + "num_tokens": 27240519.0, + "step": 1055 + }, + { + "epoch": 0.11596749396002635, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 2.423013210296631, + "learning_rate": 3.8616398243045384e-07, + "loss": 1.0345, + "mean_token_accuracy": 0.6963762044906616, + "num_tokens": 27265773.0, + "step": 1056 + }, + { + "epoch": 0.11607731166264001, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 2.323941230773926, + "learning_rate": 3.8653001464128844e-07, + "loss": 1.1328, + "mean_token_accuracy": 0.6671005487442017, + "num_tokens": 27294062.0, + "step": 1057 + }, + { + "epoch": 0.11618712936525367, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 2.604398250579834, + "learning_rate": 3.8689604685212293e-07, + "loss": 1.0663, + "mean_token_accuracy": 0.6858885288238525, + "num_tokens": 27317200.0, + "step": 1058 + }, + { + "epoch": 0.11629694706786733, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 2.272949695587158, + "learning_rate": 3.8726207906295754e-07, + "loss": 1.122, + "mean_token_accuracy": 0.6729040145874023, + "num_tokens": 27345129.0, + "step": 1059 + }, + { + "epoch": 0.11640676477048101, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 2.29331374168396, + "learning_rate": 3.876281112737921e-07, + "loss": 1.1367, + 
"mean_token_accuracy": 0.668975293636322, + "num_tokens": 27376500.0, + "step": 1060 + }, + { + "epoch": 0.11651658247309467, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 2.5014612674713135, + "learning_rate": 3.8799414348462663e-07, + "loss": 1.1181, + "mean_token_accuracy": 0.6669249534606934, + "num_tokens": 27405136.0, + "step": 1061 + }, + { + "epoch": 0.11662640017570833, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 2.4419634342193604, + "learning_rate": 3.883601756954612e-07, + "loss": 1.0737, + "mean_token_accuracy": 0.6792746186256409, + "num_tokens": 27430922.0, + "step": 1062 + }, + { + "epoch": 0.11673621787832199, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 2.7297308444976807, + "learning_rate": 3.8872620790629573e-07, + "loss": 1.0419, + "mean_token_accuracy": 0.6927100419998169, + "num_tokens": 27452160.0, + "step": 1063 + }, + { + "epoch": 0.11684603558093565, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 2.4556822776794434, + "learning_rate": 3.890922401171303e-07, + "loss": 1.103, + "mean_token_accuracy": 0.6791183948516846, + "num_tokens": 27477265.0, + "step": 1064 + }, + { + "epoch": 0.11695585328354931, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 2.455683946609497, + "learning_rate": 3.8945827232796483e-07, + "loss": 1.1343, + "mean_token_accuracy": 0.6705330610275269, + "num_tokens": 27503052.0, + "step": 1065 + }, + { + "epoch": 0.11706567098616297, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 2.588963031768799, + "learning_rate": 3.898243045387994e-07, + "loss": 1.0582, + "mean_token_accuracy": 0.693008542060852, + "num_tokens": 27526173.0, + "step": 1066 + }, + { + "epoch": 0.11717548868877663, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 2.1883411407470703, + "learning_rate": 3.90190336749634e-07, + "loss": 1.1033, + "mean_token_accuracy": 0.6863207221031189, + "num_tokens": 27556183.0, + "step": 1067 + }, + { + "epoch": 0.11728530639139029, + "ewc_loss": 2.4884939193725586e-06, + 
"grad_norm": 2.3204691410064697, + "learning_rate": 3.9055636896046853e-07, + "loss": 1.0766, + "mean_token_accuracy": 0.6845946311950684, + "num_tokens": 27583306.0, + "step": 1068 + }, + { + "epoch": 0.11739512409400395, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 2.4459190368652344, + "learning_rate": 3.9092240117130303e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6924159526824951, + "num_tokens": 27607080.0, + "step": 1069 + }, + { + "epoch": 0.11750494179661762, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 2.1312272548675537, + "learning_rate": 3.9128843338213763e-07, + "loss": 1.1371, + "mean_token_accuracy": 0.6690291166305542, + "num_tokens": 27639131.0, + "step": 1070 + }, + { + "epoch": 0.11761475949923128, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 2.6549980640411377, + "learning_rate": 3.916544655929722e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.6825985908508301, + "num_tokens": 27660915.0, + "step": 1071 + }, + { + "epoch": 0.11772457720184494, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 2.211665391921997, + "learning_rate": 3.920204978038067e-07, + "loss": 1.1134, + "mean_token_accuracy": 0.673056960105896, + "num_tokens": 27693707.0, + "step": 1072 + }, + { + "epoch": 0.1178343949044586, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 2.4045047760009766, + "learning_rate": 3.923865300146413e-07, + "loss": 1.1931, + "mean_token_accuracy": 0.6565571427345276, + "num_tokens": 27721535.0, + "step": 1073 + }, + { + "epoch": 0.11794421260707226, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 2.5924575328826904, + "learning_rate": 3.927525622254758e-07, + "loss": 0.9937, + "mean_token_accuracy": 0.6998584270477295, + "num_tokens": 27744277.0, + "step": 1074 + }, + { + "epoch": 0.11805403030968592, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 2.2563202381134033, + "learning_rate": 3.9311859443631043e-07, + "loss": 1.1091, + "mean_token_accuracy": 0.6759588718414307, + "num_tokens": 27773010.0, 
+ "step": 1075 + }, + { + "epoch": 0.11816384801229958, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 2.7095134258270264, + "learning_rate": 3.934846266471449e-07, + "loss": 1.1352, + "mean_token_accuracy": 0.6819672584533691, + "num_tokens": 27794162.0, + "step": 1076 + }, + { + "epoch": 0.11827366571491324, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 2.5235419273376465, + "learning_rate": 3.9385065885797947e-07, + "loss": 1.0914, + "mean_token_accuracy": 0.677783191204071, + "num_tokens": 27817334.0, + "step": 1077 + }, + { + "epoch": 0.1183834834175269, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 2.60482120513916, + "learning_rate": 3.942166910688141e-07, + "loss": 1.1555, + "mean_token_accuracy": 0.6582193374633789, + "num_tokens": 27841083.0, + "step": 1078 + }, + { + "epoch": 0.11849330112014057, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 2.7868597507476807, + "learning_rate": 3.9458272327964857e-07, + "loss": 1.0202, + "mean_token_accuracy": 0.6912541389465332, + "num_tokens": 27861503.0, + "step": 1079 + }, + { + "epoch": 0.11860311882275423, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 2.4002857208251953, + "learning_rate": 3.949487554904831e-07, + "loss": 1.0626, + "mean_token_accuracy": 0.6955135464668274, + "num_tokens": 27888130.0, + "step": 1080 + }, + { + "epoch": 0.1187129365253679, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 2.417292594909668, + "learning_rate": 3.953147877013177e-07, + "loss": 1.0702, + "mean_token_accuracy": 0.681882917881012, + "num_tokens": 27913796.0, + "step": 1081 + }, + { + "epoch": 0.11882275422798155, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 2.4602482318878174, + "learning_rate": 3.956808199121522e-07, + "loss": 1.0963, + "mean_token_accuracy": 0.6805737018585205, + "num_tokens": 27938790.0, + "step": 1082 + }, + { + "epoch": 0.11893257193059521, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 2.315723180770874, + "learning_rate": 3.960468521229868e-07, + 
"loss": 1.0675, + "mean_token_accuracy": 0.684099555015564, + "num_tokens": 27964042.0, + "step": 1083 + }, + { + "epoch": 0.11904238963320887, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 2.4275903701782227, + "learning_rate": 3.9641288433382137e-07, + "loss": 1.0856, + "mean_token_accuracy": 0.6862974166870117, + "num_tokens": 27990871.0, + "step": 1084 + }, + { + "epoch": 0.11915220733582253, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 2.301344871520996, + "learning_rate": 3.967789165446559e-07, + "loss": 1.0858, + "mean_token_accuracy": 0.6726789474487305, + "num_tokens": 28019249.0, + "step": 1085 + }, + { + "epoch": 0.11926202503843619, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 2.3099567890167236, + "learning_rate": 3.9714494875549047e-07, + "loss": 1.0861, + "mean_token_accuracy": 0.6745967864990234, + "num_tokens": 28047209.0, + "step": 1086 + }, + { + "epoch": 0.11937184274104985, + "ewc_loss": 2.5480985641479492e-06, + "grad_norm": 2.2349436283111572, + "learning_rate": 3.97510980966325e-07, + "loss": 1.1643, + "mean_token_accuracy": 0.6704249382019043, + "num_tokens": 28076909.0, + "step": 1087 + }, + { + "epoch": 0.11948166044366351, + "ewc_loss": 2.5480985641479492e-06, + "grad_norm": 2.3171961307525635, + "learning_rate": 3.9787701317715957e-07, + "loss": 1.1667, + "mean_token_accuracy": 0.6602007150650024, + "num_tokens": 28105066.0, + "step": 1088 + }, + { + "epoch": 0.11959147814627719, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 2.780076742172241, + "learning_rate": 3.982430453879941e-07, + "loss": 1.0691, + "mean_token_accuracy": 0.6777362823486328, + "num_tokens": 28126388.0, + "step": 1089 + }, + { + "epoch": 0.11970129584889085, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 2.4763529300689697, + "learning_rate": 3.9860907759882866e-07, + "loss": 1.026, + "mean_token_accuracy": 0.6921911239624023, + "num_tokens": 28152525.0, + "step": 1090 + }, + { + "epoch": 0.1198111135515045, + "ewc_loss": 
2.562999725341797e-06, + "grad_norm": 2.496523141860962, + "learning_rate": 3.9897510980966327e-07, + "loss": 0.9996, + "mean_token_accuracy": 0.7046142816543579, + "num_tokens": 28176649.0, + "step": 1091 + }, + { + "epoch": 0.11992093125411817, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 3.029327392578125, + "learning_rate": 3.993411420204978e-07, + "loss": 0.984, + "mean_token_accuracy": 0.714176595211029, + "num_tokens": 28194986.0, + "step": 1092 + }, + { + "epoch": 0.12003074895673183, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.266615629196167, + "learning_rate": 3.997071742313323e-07, + "loss": 1.0768, + "mean_token_accuracy": 0.6842330694198608, + "num_tokens": 28225155.0, + "step": 1093 + }, + { + "epoch": 0.12014056665934549, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.2653398513793945, + "learning_rate": 4.000732064421669e-07, + "loss": 1.069, + "mean_token_accuracy": 0.6806526184082031, + "num_tokens": 28254599.0, + "step": 1094 + }, + { + "epoch": 0.12025038436195915, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.638590097427368, + "learning_rate": 4.0043923865300146e-07, + "loss": 1.0134, + "mean_token_accuracy": 0.6949033737182617, + "num_tokens": 28277178.0, + "step": 1095 + }, + { + "epoch": 0.1203602020645728, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.436009407043457, + "learning_rate": 4.0080527086383596e-07, + "loss": 1.0804, + "mean_token_accuracy": 0.6847154498100281, + "num_tokens": 28304301.0, + "step": 1096 + }, + { + "epoch": 0.12047001976718646, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.4897913932800293, + "learning_rate": 4.0117130307467056e-07, + "loss": 0.9676, + "mean_token_accuracy": 0.71134352684021, + "num_tokens": 28328572.0, + "step": 1097 + }, + { + "epoch": 0.12057983746980014, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.6761248111724854, + "learning_rate": 4.015373352855051e-07, + "loss": 1.0588, + "mean_token_accuracy": 0.6868234276771545, + "num_tokens": 
28352705.0, + "step": 1098 + }, + { + "epoch": 0.1206896551724138, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.4432432651519775, + "learning_rate": 4.0190336749633966e-07, + "loss": 1.1102, + "mean_token_accuracy": 0.6781558990478516, + "num_tokens": 28379284.0, + "step": 1099 + }, + { + "epoch": 0.12079947287502746, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.013402223587036, + "learning_rate": 4.022693997071742e-07, + "loss": 1.1052, + "mean_token_accuracy": 0.6782757043838501, + "num_tokens": 28412910.0, + "step": 1100 + }, + { + "epoch": 0.12090929057764112, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.327934741973877, + "learning_rate": 4.0263543191800876e-07, + "loss": 1.0229, + "mean_token_accuracy": 0.7041801810264587, + "num_tokens": 28439611.0, + "step": 1101 + }, + { + "epoch": 0.12101910828025478, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 2.4647316932678223, + "learning_rate": 4.0300146412884336e-07, + "loss": 1.1309, + "mean_token_accuracy": 0.6734409928321838, + "num_tokens": 28464781.0, + "step": 1102 + }, + { + "epoch": 0.12112892598286844, + "ewc_loss": 2.5779008865356445e-06, + "grad_norm": 2.697345495223999, + "learning_rate": 4.0336749633967785e-07, + "loss": 1.0372, + "mean_token_accuracy": 0.6903091669082642, + "num_tokens": 28486031.0, + "step": 1103 + }, + { + "epoch": 0.1212387436854821, + "ewc_loss": 2.5779008865356445e-06, + "grad_norm": 2.6124165058135986, + "learning_rate": 4.037335285505124e-07, + "loss": 0.9818, + "mean_token_accuracy": 0.7050272822380066, + "num_tokens": 28507282.0, + "step": 1104 + }, + { + "epoch": 0.12134856138809576, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 2.5914623737335205, + "learning_rate": 4.04099560761347e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.6815028190612793, + "num_tokens": 28532491.0, + "step": 1105 + }, + { + "epoch": 0.12145837909070942, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 2.2612013816833496, + "learning_rate": 
4.044655929721815e-07, + "loss": 1.0226, + "mean_token_accuracy": 0.7055591940879822, + "num_tokens": 28559439.0, + "step": 1106 + }, + { + "epoch": 0.12156819679332308, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 2.472444534301758, + "learning_rate": 4.048316251830161e-07, + "loss": 1.1214, + "mean_token_accuracy": 0.6719682216644287, + "num_tokens": 28584914.0, + "step": 1107 + }, + { + "epoch": 0.12167801449593675, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 2.2887775897979736, + "learning_rate": 4.0519765739385065e-07, + "loss": 1.0853, + "mean_token_accuracy": 0.6849929094314575, + "num_tokens": 28613358.0, + "step": 1108 + }, + { + "epoch": 0.12178783219855041, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 2.493931531906128, + "learning_rate": 4.055636896046852e-07, + "loss": 1.1164, + "mean_token_accuracy": 0.6854325532913208, + "num_tokens": 28637827.0, + "step": 1109 + }, + { + "epoch": 0.12189764990116407, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 2.4016387462615967, + "learning_rate": 4.0592972181551975e-07, + "loss": 1.169, + "mean_token_accuracy": 0.6631457805633545, + "num_tokens": 28663784.0, + "step": 1110 + }, + { + "epoch": 0.12200746760377773, + "ewc_loss": 2.592802047729492e-06, + "grad_norm": 2.4957098960876465, + "learning_rate": 4.062957540263543e-07, + "loss": 1.1307, + "mean_token_accuracy": 0.668194591999054, + "num_tokens": 28686562.0, + "step": 1111 + }, + { + "epoch": 0.12211728530639139, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 2.675853967666626, + "learning_rate": 4.0666178623718885e-07, + "loss": 1.1268, + "mean_token_accuracy": 0.6788976192474365, + "num_tokens": 28713361.0, + "step": 1112 + }, + { + "epoch": 0.12222710300900505, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 2.5616579055786133, + "learning_rate": 4.070278184480234e-07, + "loss": 1.0473, + "mean_token_accuracy": 0.6897940039634705, + "num_tokens": 28736221.0, + "step": 1113 + }, + { + "epoch": 0.12233692071161871, + 
"ewc_loss": 2.60770320892334e-06, + "grad_norm": 2.140347719192505, + "learning_rate": 4.0739385065885795e-07, + "loss": 1.0269, + "mean_token_accuracy": 0.6938940286636353, + "num_tokens": 28767480.0, + "step": 1114 + }, + { + "epoch": 0.12244673841423237, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 2.6293981075286865, + "learning_rate": 4.0775988286969255e-07, + "loss": 1.0456, + "mean_token_accuracy": 0.6950984597206116, + "num_tokens": 28790622.0, + "step": 1115 + }, + { + "epoch": 0.12255655611684603, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 2.3752782344818115, + "learning_rate": 4.081259150805271e-07, + "loss": 1.087, + "mean_token_accuracy": 0.6901484727859497, + "num_tokens": 28816316.0, + "step": 1116 + }, + { + "epoch": 0.1226663738194597, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 2.511455535888672, + "learning_rate": 4.084919472913616e-07, + "loss": 1.0573, + "mean_token_accuracy": 0.6868244409561157, + "num_tokens": 28838421.0, + "step": 1117 + }, + { + "epoch": 0.12277619152207336, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 2.681427001953125, + "learning_rate": 4.088579795021962e-07, + "loss": 1.1736, + "mean_token_accuracy": 0.6622726917266846, + "num_tokens": 28863527.0, + "step": 1118 + }, + { + "epoch": 0.12288600922468702, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 2.75484561920166, + "learning_rate": 4.0922401171303075e-07, + "loss": 0.9218, + "mean_token_accuracy": 0.7245892882347107, + "num_tokens": 28882731.0, + "step": 1119 + }, + { + "epoch": 0.12299582692730068, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 2.5748403072357178, + "learning_rate": 4.0959004392386524e-07, + "loss": 1.0907, + "mean_token_accuracy": 0.6773073673248291, + "num_tokens": 28906336.0, + "step": 1120 + }, + { + "epoch": 0.12310564462991434, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 2.5651309490203857, + "learning_rate": 4.0995607613469984e-07, + "loss": 0.9965, + "mean_token_accuracy": 0.7061198949813843, 
+ "num_tokens": 28927718.0, + "step": 1121 + }, + { + "epoch": 0.123215462332528, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 2.368353843688965, + "learning_rate": 4.103221083455344e-07, + "loss": 1.1568, + "mean_token_accuracy": 0.6614425182342529, + "num_tokens": 28956062.0, + "step": 1122 + }, + { + "epoch": 0.12332528003514166, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 2.461606740951538, + "learning_rate": 4.1068814055636894e-07, + "loss": 1.0984, + "mean_token_accuracy": 0.6812334060668945, + "num_tokens": 28980636.0, + "step": 1123 + }, + { + "epoch": 0.12343509773775532, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 2.468437671661377, + "learning_rate": 4.110541727672035e-07, + "loss": 1.1274, + "mean_token_accuracy": 0.6636384725570679, + "num_tokens": 29005838.0, + "step": 1124 + }, + { + "epoch": 0.12354491544036898, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 2.3848471641540527, + "learning_rate": 4.1142020497803804e-07, + "loss": 1.1426, + "mean_token_accuracy": 0.6634668707847595, + "num_tokens": 29032584.0, + "step": 1125 + }, + { + "epoch": 0.12365473314298264, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 2.6261608600616455, + "learning_rate": 4.1178623718887264e-07, + "loss": 1.0776, + "mean_token_accuracy": 0.6830564141273499, + "num_tokens": 29054249.0, + "step": 1126 + }, + { + "epoch": 0.12376455084559632, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 2.2204601764678955, + "learning_rate": 4.1215226939970714e-07, + "loss": 1.0944, + "mean_token_accuracy": 0.6772251725196838, + "num_tokens": 29083825.0, + "step": 1127 + }, + { + "epoch": 0.12387436854820998, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 2.2096750736236572, + "learning_rate": 4.125183016105417e-07, + "loss": 1.1008, + "mean_token_accuracy": 0.6860437393188477, + "num_tokens": 29113619.0, + "step": 1128 + }, + { + "epoch": 0.12398418625082364, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 2.336824417114258, + 
"learning_rate": 4.128843338213763e-07, + "loss": 1.1415, + "mean_token_accuracy": 0.6705937385559082, + "num_tokens": 29141354.0, + "step": 1129 + }, + { + "epoch": 0.1240940039534373, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 2.5969419479370117, + "learning_rate": 4.132503660322108e-07, + "loss": 1.0814, + "mean_token_accuracy": 0.6787924766540527, + "num_tokens": 29163113.0, + "step": 1130 + }, + { + "epoch": 0.12420382165605096, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 2.317385196685791, + "learning_rate": 4.136163982430454e-07, + "loss": 1.0374, + "mean_token_accuracy": 0.6930023431777954, + "num_tokens": 29190986.0, + "step": 1131 + }, + { + "epoch": 0.12431363935866462, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 2.224607229232788, + "learning_rate": 4.1398243045387994e-07, + "loss": 0.9697, + "mean_token_accuracy": 0.7106756567955017, + "num_tokens": 29218126.0, + "step": 1132 + }, + { + "epoch": 0.12442345706127828, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 2.341233253479004, + "learning_rate": 4.143484626647145e-07, + "loss": 1.0888, + "mean_token_accuracy": 0.6769787073135376, + "num_tokens": 29246177.0, + "step": 1133 + }, + { + "epoch": 0.12453327476389194, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 2.5107266902923584, + "learning_rate": 4.1471449487554904e-07, + "loss": 1.0548, + "mean_token_accuracy": 0.6964361667633057, + "num_tokens": 29270132.0, + "step": 1134 + }, + { + "epoch": 0.1246430924665056, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 2.286708116531372, + "learning_rate": 4.150805270863836e-07, + "loss": 1.1242, + "mean_token_accuracy": 0.6688880920410156, + "num_tokens": 29298680.0, + "step": 1135 + }, + { + "epoch": 0.12475291016911927, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 2.6910512447357178, + "learning_rate": 4.1544655929721813e-07, + "loss": 1.0282, + "mean_token_accuracy": 0.6971268653869629, + "num_tokens": 29319341.0, + "step": 1136 + }, + { + "epoch": 
0.12486272787173293, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 2.4100804328918457, + "learning_rate": 4.158125915080527e-07, + "loss": 1.0717, + "mean_token_accuracy": 0.6970105171203613, + "num_tokens": 29344645.0, + "step": 1137 + }, + { + "epoch": 0.12497254557434659, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 2.308871030807495, + "learning_rate": 4.1617862371888723e-07, + "loss": 1.0439, + "mean_token_accuracy": 0.6915522217750549, + "num_tokens": 29370995.0, + "step": 1138 + }, + { + "epoch": 0.12508236327696023, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 2.4299473762512207, + "learning_rate": 4.1654465592972183e-07, + "loss": 1.1546, + "mean_token_accuracy": 0.6664213538169861, + "num_tokens": 29396692.0, + "step": 1139 + }, + { + "epoch": 0.1251921809795739, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 2.247905969619751, + "learning_rate": 4.1691068814055633e-07, + "loss": 1.1033, + "mean_token_accuracy": 0.6724361777305603, + "num_tokens": 29426669.0, + "step": 1140 + }, + { + "epoch": 0.12530199868218758, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 2.4012672901153564, + "learning_rate": 4.172767203513909e-07, + "loss": 1.0113, + "mean_token_accuracy": 0.6992146968841553, + "num_tokens": 29451681.0, + "step": 1141 + }, + { + "epoch": 0.12541181638480123, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 2.0802507400512695, + "learning_rate": 4.176427525622255e-07, + "loss": 1.1378, + "mean_token_accuracy": 0.6716638207435608, + "num_tokens": 29484702.0, + "step": 1142 + }, + { + "epoch": 0.1255216340874149, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 2.6966726779937744, + "learning_rate": 4.1800878477306003e-07, + "loss": 0.9863, + "mean_token_accuracy": 0.7038823366165161, + "num_tokens": 29504341.0, + "step": 1143 + }, + { + "epoch": 0.12563145179002855, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 2.3220369815826416, + "learning_rate": 4.183748169838945e-07, + "loss": 1.0162, + 
"mean_token_accuracy": 0.6908707618713379, + "num_tokens": 29530130.0, + "step": 1144 + }, + { + "epoch": 0.12574126949264222, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 2.211583137512207, + "learning_rate": 4.1874084919472913e-07, + "loss": 1.1159, + "mean_token_accuracy": 0.6728332042694092, + "num_tokens": 29559283.0, + "step": 1145 + }, + { + "epoch": 0.12585108719525587, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 2.451235771179199, + "learning_rate": 4.191068814055637e-07, + "loss": 1.1092, + "mean_token_accuracy": 0.6758227944374084, + "num_tokens": 29584911.0, + "step": 1146 + }, + { + "epoch": 0.12596090489786954, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 2.4006707668304443, + "learning_rate": 4.194729136163982e-07, + "loss": 0.995, + "mean_token_accuracy": 0.7021396160125732, + "num_tokens": 29610399.0, + "step": 1147 + }, + { + "epoch": 0.1260707226004832, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 2.5435385704040527, + "learning_rate": 4.198389458272328e-07, + "loss": 0.9644, + "mean_token_accuracy": 0.707448422908783, + "num_tokens": 29633032.0, + "step": 1148 + }, + { + "epoch": 0.12618054030309686, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 2.4396233558654785, + "learning_rate": 4.202049780380673e-07, + "loss": 1.0745, + "mean_token_accuracy": 0.6805081367492676, + "num_tokens": 29657214.0, + "step": 1149 + }, + { + "epoch": 0.1262903580057105, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 2.162240743637085, + "learning_rate": 4.205710102489019e-07, + "loss": 1.0816, + "mean_token_accuracy": 0.6831003427505493, + "num_tokens": 29688013.0, + "step": 1150 + }, + { + "epoch": 0.12640017570832418, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 2.6276466846466064, + "learning_rate": 4.209370424597364e-07, + "loss": 1.0193, + "mean_token_accuracy": 0.6949760913848877, + "num_tokens": 29710451.0, + "step": 1151 + }, + { + "epoch": 0.12650999341093785, + "ewc_loss": 2.771615982055664e-06, + 
"grad_norm": 2.4601938724517822, + "learning_rate": 4.2130307467057097e-07, + "loss": 1.0941, + "mean_token_accuracy": 0.6776758432388306, + "num_tokens": 29735654.0, + "step": 1152 + }, + { + "epoch": 0.1266198111135515, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 2.456265449523926, + "learning_rate": 4.2166910688140557e-07, + "loss": 1.1427, + "mean_token_accuracy": 0.6614183783531189, + "num_tokens": 29764641.0, + "step": 1153 + }, + { + "epoch": 0.12672962881616517, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 2.484131097793579, + "learning_rate": 4.2203513909224007e-07, + "loss": 1.1523, + "mean_token_accuracy": 0.6694801449775696, + "num_tokens": 29790458.0, + "step": 1154 + }, + { + "epoch": 0.12683944651877882, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 2.4688289165496826, + "learning_rate": 4.2240117130307467e-07, + "loss": 1.0588, + "mean_token_accuracy": 0.6876674294471741, + "num_tokens": 29816208.0, + "step": 1155 + }, + { + "epoch": 0.1269492642213925, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 2.4639840126037598, + "learning_rate": 4.227672035139092e-07, + "loss": 1.0906, + "mean_token_accuracy": 0.6879644393920898, + "num_tokens": 29840722.0, + "step": 1156 + }, + { + "epoch": 0.12705908192400614, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 2.373417854309082, + "learning_rate": 4.231332357247437e-07, + "loss": 1.1813, + "mean_token_accuracy": 0.6575603485107422, + "num_tokens": 29869138.0, + "step": 1157 + }, + { + "epoch": 0.1271688996266198, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 2.5666604042053223, + "learning_rate": 4.234992679355783e-07, + "loss": 1.0534, + "mean_token_accuracy": 0.6852313280105591, + "num_tokens": 29891804.0, + "step": 1158 + }, + { + "epoch": 0.12727871732923346, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 2.3263492584228516, + "learning_rate": 4.2386530014641287e-07, + "loss": 1.1396, + "mean_token_accuracy": 0.674393892288208, + "num_tokens": 29918368.0, + 
"step": 1159 + }, + { + "epoch": 0.12738853503184713, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 2.3008828163146973, + "learning_rate": 4.242313323572474e-07, + "loss": 1.1009, + "mean_token_accuracy": 0.6827174425125122, + "num_tokens": 29947243.0, + "step": 1160 + }, + { + "epoch": 0.1274983527344608, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 2.2974469661712646, + "learning_rate": 4.2459736456808197e-07, + "loss": 1.1323, + "mean_token_accuracy": 0.6710953116416931, + "num_tokens": 29976185.0, + "step": 1161 + }, + { + "epoch": 0.12760817043707445, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 2.3386895656585693, + "learning_rate": 4.249633967789165e-07, + "loss": 1.0991, + "mean_token_accuracy": 0.6756229400634766, + "num_tokens": 30004893.0, + "step": 1162 + }, + { + "epoch": 0.12771798813968813, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 2.212602376937866, + "learning_rate": 4.253294289897511e-07, + "loss": 1.1904, + "mean_token_accuracy": 0.6566011309623718, + "num_tokens": 30033645.0, + "step": 1163 + }, + { + "epoch": 0.12782780584230177, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 2.5487704277038574, + "learning_rate": 4.256954612005856e-07, + "loss": 1.0912, + "mean_token_accuracy": 0.6841570138931274, + "num_tokens": 30055734.0, + "step": 1164 + }, + { + "epoch": 0.12793762354491545, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 2.602264881134033, + "learning_rate": 4.2606149341142016e-07, + "loss": 1.0122, + "mean_token_accuracy": 0.6939767003059387, + "num_tokens": 30076609.0, + "step": 1165 + }, + { + "epoch": 0.1280474412475291, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 2.598437547683716, + "learning_rate": 4.2642752562225476e-07, + "loss": 1.1292, + "mean_token_accuracy": 0.6635171175003052, + "num_tokens": 30101695.0, + "step": 1166 + }, + { + "epoch": 0.12815725895014277, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 2.456725597381592, + "learning_rate": 4.267935578330893e-07, 
+ "loss": 1.1357, + "mean_token_accuracy": 0.6723402142524719, + "num_tokens": 30128385.0, + "step": 1167 + }, + { + "epoch": 0.1282670766527564, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 2.594184398651123, + "learning_rate": 4.271595900439238e-07, + "loss": 0.9168, + "mean_token_accuracy": 0.7277756929397583, + "num_tokens": 30149434.0, + "step": 1168 + }, + { + "epoch": 0.12837689435537009, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 2.3237717151641846, + "learning_rate": 4.275256222547584e-07, + "loss": 1.0273, + "mean_token_accuracy": 0.6960698962211609, + "num_tokens": 30175072.0, + "step": 1169 + }, + { + "epoch": 0.12848671205798376, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 2.26885724067688, + "learning_rate": 4.2789165446559296e-07, + "loss": 1.086, + "mean_token_accuracy": 0.6837741136550903, + "num_tokens": 30204221.0, + "step": 1170 + }, + { + "epoch": 0.1285965297605974, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 2.245847225189209, + "learning_rate": 4.282576866764275e-07, + "loss": 1.1075, + "mean_token_accuracy": 0.6754711866378784, + "num_tokens": 30232624.0, + "step": 1171 + }, + { + "epoch": 0.12870634746321108, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 2.3285439014434814, + "learning_rate": 4.2862371888726206e-07, + "loss": 1.091, + "mean_token_accuracy": 0.6854227781295776, + "num_tokens": 30259309.0, + "step": 1172 + }, + { + "epoch": 0.12881616516582473, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 2.3843801021575928, + "learning_rate": 4.289897510980966e-07, + "loss": 1.0864, + "mean_token_accuracy": 0.6852476000785828, + "num_tokens": 30286533.0, + "step": 1173 + }, + { + "epoch": 0.1289259828684384, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 2.4860618114471436, + "learning_rate": 4.293557833089312e-07, + "loss": 1.0474, + "mean_token_accuracy": 0.6880244016647339, + "num_tokens": 30311150.0, + "step": 1174 + }, + { + "epoch": 0.12903580057105205, + "ewc_loss": 2.86102294921875e-06, 
+ "grad_norm": 2.5112664699554443, + "learning_rate": 4.297218155197657e-07, + "loss": 1.0134, + "mean_token_accuracy": 0.6956517696380615, + "num_tokens": 30334858.0, + "step": 1175 + }, + { + "epoch": 0.12914561827366572, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 2.5923781394958496, + "learning_rate": 4.3008784773060026e-07, + "loss": 0.9901, + "mean_token_accuracy": 0.7095503211021423, + "num_tokens": 30358502.0, + "step": 1176 + }, + { + "epoch": 0.12925543597627936, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 2.3868966102600098, + "learning_rate": 4.3045387994143486e-07, + "loss": 1.1211, + "mean_token_accuracy": 0.6700190305709839, + "num_tokens": 30384783.0, + "step": 1177 + }, + { + "epoch": 0.12936525367889304, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 2.4547009468078613, + "learning_rate": 4.3081991215226935e-07, + "loss": 1.0915, + "mean_token_accuracy": 0.6901086568832397, + "num_tokens": 30408913.0, + "step": 1178 + }, + { + "epoch": 0.1294750713815067, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.4611802101135254, + "learning_rate": 4.3118594436310396e-07, + "loss": 1.0458, + "mean_token_accuracy": 0.6944179534912109, + "num_tokens": 30433900.0, + "step": 1179 + }, + { + "epoch": 0.12958488908412036, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.552687644958496, + "learning_rate": 4.315519765739385e-07, + "loss": 1.0097, + "mean_token_accuracy": 0.7030351758003235, + "num_tokens": 30461190.0, + "step": 1180 + }, + { + "epoch": 0.12969470678673403, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.471646785736084, + "learning_rate": 4.31918008784773e-07, + "loss": 1.1092, + "mean_token_accuracy": 0.6746035814285278, + "num_tokens": 30489651.0, + "step": 1181 + }, + { + "epoch": 0.12980452448934768, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.46073842048645, + "learning_rate": 4.322840409956076e-07, + "loss": 1.0851, + "mean_token_accuracy": 0.693814218044281, + "num_tokens": 30519052.0, + 
"step": 1182 + }, + { + "epoch": 0.12991434219196135, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.3511223793029785, + "learning_rate": 4.3265007320644215e-07, + "loss": 0.9925, + "mean_token_accuracy": 0.703625500202179, + "num_tokens": 30543441.0, + "step": 1183 + }, + { + "epoch": 0.130024159894575, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.527458667755127, + "learning_rate": 4.330161054172767e-07, + "loss": 1.1406, + "mean_token_accuracy": 0.669806182384491, + "num_tokens": 30567401.0, + "step": 1184 + }, + { + "epoch": 0.13013397759718867, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.444149971008301, + "learning_rate": 4.3338213762811125e-07, + "loss": 1.1392, + "mean_token_accuracy": 0.6668174862861633, + "num_tokens": 30594937.0, + "step": 1185 + }, + { + "epoch": 0.13024379529980232, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.2848424911499023, + "learning_rate": 4.337481698389458e-07, + "loss": 0.9921, + "mean_token_accuracy": 0.7077285051345825, + "num_tokens": 30623647.0, + "step": 1186 + }, + { + "epoch": 0.130353613002416, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.418247699737549, + "learning_rate": 4.341142020497804e-07, + "loss": 1.1011, + "mean_token_accuracy": 0.6821117997169495, + "num_tokens": 30650386.0, + "step": 1187 + }, + { + "epoch": 0.13046343070502964, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 2.655144214630127, + "learning_rate": 4.344802342606149e-07, + "loss": 1.0751, + "mean_token_accuracy": 0.6809411644935608, + "num_tokens": 30673298.0, + "step": 1188 + }, + { + "epoch": 0.1305732484076433, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 2.2307403087615967, + "learning_rate": 4.3484626647144945e-07, + "loss": 1.0999, + "mean_token_accuracy": 0.6840146780014038, + "num_tokens": 30702162.0, + "step": 1189 + }, + { + "epoch": 0.13068306611025698, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 2.4850339889526367, + "learning_rate": 4.3521229868228405e-07, + 
"loss": 1.1293, + "mean_token_accuracy": 0.6749814748764038, + "num_tokens": 30728125.0, + "step": 1190 + }, + { + "epoch": 0.13079288381287063, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 2.3849070072174072, + "learning_rate": 4.355783308931186e-07, + "loss": 1.0092, + "mean_token_accuracy": 0.7006295919418335, + "num_tokens": 30753050.0, + "step": 1191 + }, + { + "epoch": 0.1309027015154843, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 2.43354868888855, + "learning_rate": 4.359443631039531e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6874043345451355, + "num_tokens": 30778703.0, + "step": 1192 + }, + { + "epoch": 0.13101251921809795, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 2.1351916790008545, + "learning_rate": 4.363103953147877e-07, + "loss": 1.0545, + "mean_token_accuracy": 0.6865620613098145, + "num_tokens": 30810814.0, + "step": 1193 + }, + { + "epoch": 0.13112233692071162, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 2.3812217712402344, + "learning_rate": 4.3667642752562224e-07, + "loss": 1.0657, + "mean_token_accuracy": 0.6983139514923096, + "num_tokens": 30836427.0, + "step": 1194 + }, + { + "epoch": 0.13123215462332527, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 2.3079586029052734, + "learning_rate": 4.370424597364568e-07, + "loss": 1.131, + "mean_token_accuracy": 0.6680768728256226, + "num_tokens": 30865532.0, + "step": 1195 + }, + { + "epoch": 0.13134197232593894, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.75722074508667, + "learning_rate": 4.3740849194729134e-07, + "loss": 1.0537, + "mean_token_accuracy": 0.6849038600921631, + "num_tokens": 30885416.0, + "step": 1196 + }, + { + "epoch": 0.1314517900285526, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 2.454338312149048, + "learning_rate": 4.377745241581259e-07, + "loss": 1.0278, + "mean_token_accuracy": 0.6906623244285583, + "num_tokens": 30909752.0, + "step": 1197 + }, + { + "epoch": 0.13156160773116626, + "ewc_loss": 
2.9355287551879883e-06, + "grad_norm": 2.8464653491973877, + "learning_rate": 4.381405563689605e-07, + "loss": 1.0097, + "mean_token_accuracy": 0.7019533514976501, + "num_tokens": 30928463.0, + "step": 1198 + }, + { + "epoch": 0.13167142543377994, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.700406789779663, + "learning_rate": 4.38506588579795e-07, + "loss": 1.0519, + "mean_token_accuracy": 0.6891085505485535, + "num_tokens": 30950612.0, + "step": 1199 + }, + { + "epoch": 0.13178124313639358, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.3345110416412354, + "learning_rate": 4.3887262079062954e-07, + "loss": 1.0489, + "mean_token_accuracy": 0.6979016661643982, + "num_tokens": 30975408.0, + "step": 1200 + }, + { + "epoch": 0.13189106083900726, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.518448829650879, + "learning_rate": 4.3923865300146414e-07, + "loss": 0.9574, + "mean_token_accuracy": 0.7125113010406494, + "num_tokens": 30998382.0, + "step": 1201 + }, + { + "epoch": 0.1320008785416209, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.226698637008667, + "learning_rate": 4.3960468521229864e-07, + "loss": 1.0112, + "mean_token_accuracy": 0.7024608254432678, + "num_tokens": 31024645.0, + "step": 1202 + }, + { + "epoch": 0.13211069624423458, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.4390745162963867, + "learning_rate": 4.3997071742313324e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6931812167167664, + "num_tokens": 31050392.0, + "step": 1203 + }, + { + "epoch": 0.13222051394684822, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.5577774047851562, + "learning_rate": 4.403367496339678e-07, + "loss": 1.0031, + "mean_token_accuracy": 0.70209801197052, + "num_tokens": 31072099.0, + "step": 1204 + }, + { + "epoch": 0.1323303316494619, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.2994935512542725, + "learning_rate": 4.407027818448023e-07, + "loss": 1.0386, + "mean_token_accuracy": 0.689433217048645, + 
"num_tokens": 31099348.0, + "step": 1205 + }, + { + "epoch": 0.13244014935207554, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 8.570137977600098, + "learning_rate": 4.410688140556369e-07, + "loss": 1.0375, + "mean_token_accuracy": 0.6910667419433594, + "num_tokens": 31127674.0, + "step": 1206 + }, + { + "epoch": 0.13254996705468922, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.7604384422302246, + "learning_rate": 4.4143484626647144e-07, + "loss": 1.0307, + "mean_token_accuracy": 0.7023890614509583, + "num_tokens": 31147826.0, + "step": 1207 + }, + { + "epoch": 0.1326597847573029, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.754530191421509, + "learning_rate": 4.41800878477306e-07, + "loss": 0.9781, + "mean_token_accuracy": 0.7055131793022156, + "num_tokens": 31168192.0, + "step": 1208 + }, + { + "epoch": 0.13276960245991654, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 2.7778947353363037, + "learning_rate": 4.4216691068814053e-07, + "loss": 0.8716, + "mean_token_accuracy": 0.724640965461731, + "num_tokens": 31186404.0, + "step": 1209 + }, + { + "epoch": 0.1328794201625302, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.2478718757629395, + "learning_rate": 4.425329428989751e-07, + "loss": 1.1261, + "mean_token_accuracy": 0.6654337644577026, + "num_tokens": 31216377.0, + "step": 1210 + }, + { + "epoch": 0.13298923786514386, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 2.556464672088623, + "learning_rate": 4.428989751098097e-07, + "loss": 1.0788, + "mean_token_accuracy": 0.6825848817825317, + "num_tokens": 31241083.0, + "step": 1211 + }, + { + "epoch": 0.13309905556775753, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.4718031883239746, + "learning_rate": 4.432650073206442e-07, + "loss": 1.1121, + "mean_token_accuracy": 0.6756521463394165, + "num_tokens": 31269362.0, + "step": 1212 + }, + { + "epoch": 0.13320887327037118, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.0622951984405518, + "learning_rate": 
4.4363103953147873e-07, + "loss": 1.0147, + "mean_token_accuracy": 0.6965640783309937, + "num_tokens": 31301121.0, + "step": 1213 + }, + { + "epoch": 0.13331869097298485, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.690519094467163, + "learning_rate": 4.4399707174231333e-07, + "loss": 1.113, + "mean_token_accuracy": 0.6679918766021729, + "num_tokens": 31323850.0, + "step": 1214 + }, + { + "epoch": 0.1334285086755985, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.435279369354248, + "learning_rate": 4.443631039531479e-07, + "loss": 1.0976, + "mean_token_accuracy": 0.6851279735565186, + "num_tokens": 31349231.0, + "step": 1215 + }, + { + "epoch": 0.13353832637821217, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 2.7993295192718506, + "learning_rate": 4.447291361639824e-07, + "loss": 1.1085, + "mean_token_accuracy": 0.67723548412323, + "num_tokens": 31368491.0, + "step": 1216 + }, + { + "epoch": 0.13364814408082584, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 2.4044008255004883, + "learning_rate": 4.45095168374817e-07, + "loss": 0.9755, + "mean_token_accuracy": 0.7041652202606201, + "num_tokens": 31392465.0, + "step": 1217 + }, + { + "epoch": 0.1337579617834395, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.4957306385040283, + "learning_rate": 4.4546120058565153e-07, + "loss": 1.0804, + "mean_token_accuracy": 0.6766594648361206, + "num_tokens": 31417587.0, + "step": 1218 + }, + { + "epoch": 0.13386777948605316, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.160050392150879, + "learning_rate": 4.458272327964861e-07, + "loss": 1.0344, + "mean_token_accuracy": 0.6952624917030334, + "num_tokens": 31448313.0, + "step": 1219 + }, + { + "epoch": 0.1339775971886668, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.5494749546051025, + "learning_rate": 4.4619326500732063e-07, + "loss": 1.0376, + "mean_token_accuracy": 0.6893872618675232, + "num_tokens": 31472179.0, + "step": 1220 + }, + { + "epoch": 0.13408741489128048, + 
"ewc_loss": 2.9653310775756836e-06, + "grad_norm": 2.2249443531036377, + "learning_rate": 4.465592972181552e-07, + "loss": 1.0978, + "mean_token_accuracy": 0.6859596967697144, + "num_tokens": 31501837.0, + "step": 1221 + }, + { + "epoch": 0.13419723259389413, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 2.563246488571167, + "learning_rate": 4.469253294289897e-07, + "loss": 1.0194, + "mean_token_accuracy": 0.6983332633972168, + "num_tokens": 31523276.0, + "step": 1222 + }, + { + "epoch": 0.1343070502965078, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 2.566653251647949, + "learning_rate": 4.472913616398243e-07, + "loss": 1.167, + "mean_token_accuracy": 0.6643295288085938, + "num_tokens": 31547514.0, + "step": 1223 + }, + { + "epoch": 0.13441686799912145, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 2.2445573806762695, + "learning_rate": 4.476573938506588e-07, + "loss": 1.0007, + "mean_token_accuracy": 0.6996864676475525, + "num_tokens": 31575007.0, + "step": 1224 + }, + { + "epoch": 0.13452668570173512, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 2.401482343673706, + "learning_rate": 4.480234260614934e-07, + "loss": 1.0451, + "mean_token_accuracy": 0.6881453990936279, + "num_tokens": 31600483.0, + "step": 1225 + }, + { + "epoch": 0.13463650340434877, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 2.546297550201416, + "learning_rate": 4.483894582723279e-07, + "loss": 1.0331, + "mean_token_accuracy": 0.6982389092445374, + "num_tokens": 31622639.0, + "step": 1226 + }, + { + "epoch": 0.13474632110696244, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 2.5794239044189453, + "learning_rate": 4.487554904831625e-07, + "loss": 1.0535, + "mean_token_accuracy": 0.6992866396903992, + "num_tokens": 31647351.0, + "step": 1227 + }, + { + "epoch": 0.13485613880957611, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 2.77601957321167, + "learning_rate": 4.4912152269399707e-07, + "loss": 1.1136, + "mean_token_accuracy": 
0.674998939037323, + "num_tokens": 31667220.0, + "step": 1228 + }, + { + "epoch": 0.13496595651218976, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 2.6229889392852783, + "learning_rate": 4.4948755490483157e-07, + "loss": 1.1242, + "mean_token_accuracy": 0.6784208416938782, + "num_tokens": 31689044.0, + "step": 1229 + }, + { + "epoch": 0.13507577421480343, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 2.4395596981048584, + "learning_rate": 4.4985358711566617e-07, + "loss": 1.2557, + "mean_token_accuracy": 0.6456613540649414, + "num_tokens": 31717794.0, + "step": 1230 + }, + { + "epoch": 0.13518559191741708, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 2.4273104667663574, + "learning_rate": 4.502196193265007e-07, + "loss": 1.1206, + "mean_token_accuracy": 0.6722157001495361, + "num_tokens": 31743349.0, + "step": 1231 + }, + { + "epoch": 0.13529540962003075, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 2.9064371585845947, + "learning_rate": 4.5058565153733527e-07, + "loss": 1.003, + "mean_token_accuracy": 0.6956473588943481, + "num_tokens": 31761698.0, + "step": 1232 + }, + { + "epoch": 0.1354052273226444, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 2.926054000854492, + "learning_rate": 4.509516837481698e-07, + "loss": 0.998, + "mean_token_accuracy": 0.7065445780754089, + "num_tokens": 31780060.0, + "step": 1233 + }, + { + "epoch": 0.13551504502525807, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 2.206328868865967, + "learning_rate": 4.5131771595900437e-07, + "loss": 1.0126, + "mean_token_accuracy": 0.7006675004959106, + "num_tokens": 31810081.0, + "step": 1234 + }, + { + "epoch": 0.13562486272787172, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 2.3359339237213135, + "learning_rate": 4.5168374816983897e-07, + "loss": 1.08, + "mean_token_accuracy": 0.6811800003051758, + "num_tokens": 31838983.0, + "step": 1235 + }, + { + "epoch": 0.1357346804304854, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 
2.3889167308807373, + "learning_rate": 4.5204978038067347e-07, + "loss": 1.0882, + "mean_token_accuracy": 0.6879485845565796, + "num_tokens": 31864058.0, + "step": 1236 + }, + { + "epoch": 0.13584449813309907, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 2.463827610015869, + "learning_rate": 4.52415812591508e-07, + "loss": 1.0666, + "mean_token_accuracy": 0.6835947036743164, + "num_tokens": 31889614.0, + "step": 1237 + }, + { + "epoch": 0.1359543158357127, + "ewc_loss": 3.0100345611572266e-06, + "grad_norm": 2.4258882999420166, + "learning_rate": 4.527818448023426e-07, + "loss": 1.1703, + "mean_token_accuracy": 0.6698732376098633, + "num_tokens": 31915897.0, + "step": 1238 + }, + { + "epoch": 0.1360641335383264, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 2.3782947063446045, + "learning_rate": 4.531478770131771e-07, + "loss": 1.0235, + "mean_token_accuracy": 0.6920495629310608, + "num_tokens": 31943588.0, + "step": 1239 + }, + { + "epoch": 0.13617395124094003, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 2.391444683074951, + "learning_rate": 4.5351390922401166e-07, + "loss": 1.0854, + "mean_token_accuracy": 0.6843950748443604, + "num_tokens": 31970532.0, + "step": 1240 + }, + { + "epoch": 0.1362837689435537, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 2.5014090538024902, + "learning_rate": 4.5387994143484626e-07, + "loss": 1.05, + "mean_token_accuracy": 0.6958311796188354, + "num_tokens": 31996697.0, + "step": 1241 + }, + { + "epoch": 0.13639358664616735, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 2.6759934425354004, + "learning_rate": 4.542459736456808e-07, + "loss": 1.0035, + "mean_token_accuracy": 0.7054644823074341, + "num_tokens": 32018303.0, + "step": 1242 + }, + { + "epoch": 0.13650340434878103, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 2.246197462081909, + "learning_rate": 4.5461200585651536e-07, + "loss": 1.126, + "mean_token_accuracy": 0.6666111946105957, + "num_tokens": 32046571.0, + "step": 1243 + }, 
+ { + "epoch": 0.13661322205139467, + "ewc_loss": 3.0547380447387695e-06, + "grad_norm": 2.510282278060913, + "learning_rate": 4.549780380673499e-07, + "loss": 1.1499, + "mean_token_accuracy": 0.6682723760604858, + "num_tokens": 32072281.0, + "step": 1244 + }, + { + "epoch": 0.13672303975400835, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 2.4650893211364746, + "learning_rate": 4.5534407027818446e-07, + "loss": 1.1866, + "mean_token_accuracy": 0.6596579551696777, + "num_tokens": 32099451.0, + "step": 1245 + }, + { + "epoch": 0.13683285745662202, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 2.440004348754883, + "learning_rate": 4.55710102489019e-07, + "loss": 1.0516, + "mean_token_accuracy": 0.6924762725830078, + "num_tokens": 32124080.0, + "step": 1246 + }, + { + "epoch": 0.13694267515923567, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 2.324678659439087, + "learning_rate": 4.5607613469985356e-07, + "loss": 1.1537, + "mean_token_accuracy": 0.6627686619758606, + "num_tokens": 32153434.0, + "step": 1247 + }, + { + "epoch": 0.13705249286184934, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 2.5241472721099854, + "learning_rate": 4.564421669106881e-07, + "loss": 1.0293, + "mean_token_accuracy": 0.6968051195144653, + "num_tokens": 32176510.0, + "step": 1248 + }, + { + "epoch": 0.13716231056446299, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 2.138612747192383, + "learning_rate": 4.568081991215227e-07, + "loss": 1.0817, + "mean_token_accuracy": 0.6787334680557251, + "num_tokens": 32206030.0, + "step": 1249 + }, + { + "epoch": 0.13727212826707666, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 2.332185745239258, + "learning_rate": 4.571742313323572e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6915877461433411, + "num_tokens": 32233255.0, + "step": 1250 + }, + { + "epoch": 0.1373819459696903, + "ewc_loss": 3.0994415283203125e-06, + "grad_norm": 2.401125907897949, + "learning_rate": 4.575402635431918e-07, + "loss": 1.0484, + 
"mean_token_accuracy": 0.6938602924346924, + "num_tokens": 32257818.0, + "step": 1251 + }, + { + "epoch": 0.13749176367230398, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.2999160289764404, + "learning_rate": 4.5790629575402636e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6887702941894531, + "num_tokens": 32286826.0, + "step": 1252 + }, + { + "epoch": 0.13760158137491763, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.5075058937072754, + "learning_rate": 4.5827232796486085e-07, + "loss": 1.061, + "mean_token_accuracy": 0.6863299608230591, + "num_tokens": 32310447.0, + "step": 1253 + }, + { + "epoch": 0.1377113990775313, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.410942554473877, + "learning_rate": 4.5863836017569545e-07, + "loss": 1.1044, + "mean_token_accuracy": 0.6902700066566467, + "num_tokens": 32335713.0, + "step": 1254 + }, + { + "epoch": 0.13782121678014497, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.365311861038208, + "learning_rate": 4.5900439238653e-07, + "loss": 1.0717, + "mean_token_accuracy": 0.6890829801559448, + "num_tokens": 32361259.0, + "step": 1255 + }, + { + "epoch": 0.13793103448275862, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.679006338119507, + "learning_rate": 4.593704245973645e-07, + "loss": 1.0568, + "mean_token_accuracy": 0.6874042749404907, + "num_tokens": 32383138.0, + "step": 1256 + }, + { + "epoch": 0.1380408521853723, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.3638670444488525, + "learning_rate": 4.597364568081991e-07, + "loss": 1.0571, + "mean_token_accuracy": 0.6862573027610779, + "num_tokens": 32409879.0, + "step": 1257 + }, + { + "epoch": 0.13815066988798594, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 3.0306951999664307, + "learning_rate": 4.6010248901903365e-07, + "loss": 1.0378, + "mean_token_accuracy": 0.6898477077484131, + "num_tokens": 32430039.0, + "step": 1258 + }, + { + "epoch": 0.1382604875905996, + "ewc_loss": 3.084540367126465e-06, + 
"grad_norm": 2.381232976913452, + "learning_rate": 4.6046852122986825e-07, + "loss": 1.0406, + "mean_token_accuracy": 0.6903308629989624, + "num_tokens": 32456043.0, + "step": 1259 + }, + { + "epoch": 0.13837030529321326, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.104218006134033, + "learning_rate": 4.6083455344070275e-07, + "loss": 1.1491, + "mean_token_accuracy": 0.6644648313522339, + "num_tokens": 32489703.0, + "step": 1260 + }, + { + "epoch": 0.13848012299582693, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.037663459777832, + "learning_rate": 4.612005856515373e-07, + "loss": 1.1028, + "mean_token_accuracy": 0.6748650074005127, + "num_tokens": 32523481.0, + "step": 1261 + }, + { + "epoch": 0.13858994069844058, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.722519636154175, + "learning_rate": 4.615666178623719e-07, + "loss": 1.0389, + "mean_token_accuracy": 0.6949074268341064, + "num_tokens": 32544230.0, + "step": 1262 + }, + { + "epoch": 0.13869975840105425, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 2.7319886684417725, + "learning_rate": 4.619326500732064e-07, + "loss": 1.0111, + "mean_token_accuracy": 0.6971772313117981, + "num_tokens": 32563592.0, + "step": 1263 + }, + { + "epoch": 0.1388095761036679, + "ewc_loss": 3.0994415283203125e-06, + "grad_norm": 2.348574638366699, + "learning_rate": 4.6229868228404095e-07, + "loss": 1.1671, + "mean_token_accuracy": 0.6716184616088867, + "num_tokens": 32592096.0, + "step": 1264 + }, + { + "epoch": 0.13891939380628157, + "ewc_loss": 3.0994415283203125e-06, + "grad_norm": 2.4712460041046143, + "learning_rate": 4.6266471449487555e-07, + "loss": 1.0355, + "mean_token_accuracy": 0.6908209323883057, + "num_tokens": 32615477.0, + "step": 1265 + }, + { + "epoch": 0.13902921150889525, + "ewc_loss": 3.0994415283203125e-06, + "grad_norm": 2.625481605529785, + "learning_rate": 4.630307467057101e-07, + "loss": 1.0314, + "mean_token_accuracy": 0.6918429136276245, + "num_tokens": 32637654.0, + 
"step": 1266 + }, + { + "epoch": 0.1391390292115089, + "ewc_loss": 3.0994415283203125e-06, + "grad_norm": 2.4703404903411865, + "learning_rate": 4.6339677891654465e-07, + "loss": 1.008, + "mean_token_accuracy": 0.6984488368034363, + "num_tokens": 32661199.0, + "step": 1267 + }, + { + "epoch": 0.13924884691412256, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 2.5735433101654053, + "learning_rate": 4.637628111273792e-07, + "loss": 1.1026, + "mean_token_accuracy": 0.6753121018409729, + "num_tokens": 32684302.0, + "step": 1268 + }, + { + "epoch": 0.1393586646167362, + "ewc_loss": 3.0994415283203125e-06, + "grad_norm": 2.2386398315429688, + "learning_rate": 4.6412884333821374e-07, + "loss": 1.1381, + "mean_token_accuracy": 0.6660317778587341, + "num_tokens": 32711546.0, + "step": 1269 + }, + { + "epoch": 0.13946848231934988, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 2.2325353622436523, + "learning_rate": 4.644948755490483e-07, + "loss": 1.1531, + "mean_token_accuracy": 0.6640005111694336, + "num_tokens": 32741165.0, + "step": 1270 + }, + { + "epoch": 0.13957830002196353, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 2.153064727783203, + "learning_rate": 4.6486090775988284e-07, + "loss": 1.0656, + "mean_token_accuracy": 0.68660569190979, + "num_tokens": 32774701.0, + "step": 1271 + }, + { + "epoch": 0.1396881177245772, + "ewc_loss": 3.129243850708008e-06, + "grad_norm": 2.548788547515869, + "learning_rate": 4.652269399707174e-07, + "loss": 1.1525, + "mean_token_accuracy": 0.6658392548561096, + "num_tokens": 32798129.0, + "step": 1272 + }, + { + "epoch": 0.13979793542719085, + "ewc_loss": 3.129243850708008e-06, + "grad_norm": 2.157703161239624, + "learning_rate": 4.65592972181552e-07, + "loss": 1.0695, + "mean_token_accuracy": 0.689661979675293, + "num_tokens": 32829323.0, + "step": 1273 + }, + { + "epoch": 0.13990775312980452, + "ewc_loss": 3.129243850708008e-06, + "grad_norm": 2.300678014755249, + "learning_rate": 4.659590043923865e-07, + "loss": 
0.994, + "mean_token_accuracy": 0.70560622215271, + "num_tokens": 32856609.0, + "step": 1274 + }, + { + "epoch": 0.1400175708324182, + "ewc_loss": 3.129243850708008e-06, + "grad_norm": 2.441718816757202, + "learning_rate": 4.663250366032211e-07, + "loss": 1.0475, + "mean_token_accuracy": 0.6860278844833374, + "num_tokens": 32881060.0, + "step": 1275 + }, + { + "epoch": 0.14012738853503184, + "ewc_loss": 3.129243850708008e-06, + "grad_norm": 2.406216859817505, + "learning_rate": 4.6669106881405564e-07, + "loss": 1.0128, + "mean_token_accuracy": 0.7123316526412964, + "num_tokens": 32904678.0, + "step": 1276 + }, + { + "epoch": 0.14023720623764552, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 2.592991352081299, + "learning_rate": 4.6705710102489014e-07, + "loss": 1.0228, + "mean_token_accuracy": 0.6900532245635986, + "num_tokens": 32928285.0, + "step": 1277 + }, + { + "epoch": 0.14034702394025916, + "ewc_loss": 3.1441450119018555e-06, + "grad_norm": 2.5104305744171143, + "learning_rate": 4.6742313323572474e-07, + "loss": 1.0545, + "mean_token_accuracy": 0.6912648677825928, + "num_tokens": 32953717.0, + "step": 1278 + }, + { + "epoch": 0.14045684164287284, + "ewc_loss": 3.129243850708008e-06, + "grad_norm": 2.0704026222229004, + "learning_rate": 4.677891654465593e-07, + "loss": 1.0315, + "mean_token_accuracy": 0.695594310760498, + "num_tokens": 32985152.0, + "step": 1279 + }, + { + "epoch": 0.14056665934548648, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 2.5249366760253906, + "learning_rate": 4.681551976573938e-07, + "loss": 1.0465, + "mean_token_accuracy": 0.6897251009941101, + "num_tokens": 33009420.0, + "step": 1280 + }, + { + "epoch": 0.14067647704810016, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 2.2614428997039795, + "learning_rate": 4.685212298682284e-07, + "loss": 1.1755, + "mean_token_accuracy": 0.6539168357849121, + "num_tokens": 33042302.0, + "step": 1281 + }, + { + "epoch": 0.1407862947507138, + "ewc_loss": 3.159046173095703e-06, + 
"grad_norm": 2.1853160858154297, + "learning_rate": 4.6888726207906293e-07, + "loss": 1.0678, + "mean_token_accuracy": 0.6826146841049194, + "num_tokens": 33072431.0, + "step": 1282 + }, + { + "epoch": 0.14089611245332748, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 2.3777332305908203, + "learning_rate": 4.6925329428989754e-07, + "loss": 1.1126, + "mean_token_accuracy": 0.6690269708633423, + "num_tokens": 33098031.0, + "step": 1283 + }, + { + "epoch": 0.14100593015594115, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 2.4773452281951904, + "learning_rate": 4.6961932650073203e-07, + "loss": 1.0965, + "mean_token_accuracy": 0.6847485303878784, + "num_tokens": 33121574.0, + "step": 1284 + }, + { + "epoch": 0.1411157478585548, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 2.568775177001953, + "learning_rate": 4.699853587115666e-07, + "loss": 1.1096, + "mean_token_accuracy": 0.6813251376152039, + "num_tokens": 33145414.0, + "step": 1285 + }, + { + "epoch": 0.14122556556116847, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 2.555649995803833, + "learning_rate": 4.703513909224012e-07, + "loss": 1.0071, + "mean_token_accuracy": 0.6990700960159302, + "num_tokens": 33166555.0, + "step": 1286 + }, + { + "epoch": 0.14133538326378212, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 2.0436699390411377, + "learning_rate": 4.707174231332357e-07, + "loss": 1.1327, + "mean_token_accuracy": 0.669840931892395, + "num_tokens": 33200074.0, + "step": 1287 + }, + { + "epoch": 0.1414452009663958, + "ewc_loss": 3.1888484954833984e-06, + "grad_norm": 2.690582036972046, + "learning_rate": 4.7108345534407023e-07, + "loss": 0.9084, + "mean_token_accuracy": 0.7290151119232178, + "num_tokens": 33220090.0, + "step": 1288 + }, + { + "epoch": 0.14155501866900944, + "ewc_loss": 3.1888484954833984e-06, + "grad_norm": 2.244340658187866, + "learning_rate": 4.7144948755490483e-07, + "loss": 1.0261, + "mean_token_accuracy": 0.6905393004417419, + "num_tokens": 33247080.0, + 
"step": 1289 + }, + { + "epoch": 0.1416648363716231, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 2.1714000701904297, + "learning_rate": 4.718155197657394e-07, + "loss": 1.1235, + "mean_token_accuracy": 0.6779213547706604, + "num_tokens": 33276588.0, + "step": 1290 + }, + { + "epoch": 0.14177465407423676, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 2.7212839126586914, + "learning_rate": 4.7218155197657393e-07, + "loss": 0.9793, + "mean_token_accuracy": 0.7054566740989685, + "num_tokens": 33296291.0, + "step": 1291 + }, + { + "epoch": 0.14188447177685043, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 2.2918701171875, + "learning_rate": 4.725475841874085e-07, + "loss": 1.1299, + "mean_token_accuracy": 0.6675506234169006, + "num_tokens": 33327544.0, + "step": 1292 + }, + { + "epoch": 0.1419942894794641, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 2.082798957824707, + "learning_rate": 4.7291361639824303e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.6917203664779663, + "num_tokens": 33360733.0, + "step": 1293 + }, + { + "epoch": 0.14210410718207775, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 2.280571937561035, + "learning_rate": 4.732796486090776e-07, + "loss": 1.0253, + "mean_token_accuracy": 0.698056161403656, + "num_tokens": 33386974.0, + "step": 1294 + }, + { + "epoch": 0.14221392488469142, + "ewc_loss": 3.1888484954833984e-06, + "grad_norm": 2.5088632106781006, + "learning_rate": 4.736456808199121e-07, + "loss": 1.0539, + "mean_token_accuracy": 0.6982555389404297, + "num_tokens": 33410360.0, + "step": 1295 + }, + { + "epoch": 0.14232374258730507, + "ewc_loss": 3.1888484954833984e-06, + "grad_norm": 2.475541114807129, + "learning_rate": 4.740117130307467e-07, + "loss": 1.0533, + "mean_token_accuracy": 0.687292218208313, + "num_tokens": 33435322.0, + "step": 1296 + }, + { + "epoch": 0.14243356028991874, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 2.3579628467559814, + "learning_rate": 4.743777452415813e-07, + 
"loss": 1.2145, + "mean_token_accuracy": 0.6524038314819336, + "num_tokens": 33464229.0, + "step": 1297 + }, + { + "epoch": 0.1425433779925324, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.565361261367798, + "learning_rate": 4.7474377745241577e-07, + "loss": 0.9607, + "mean_token_accuracy": 0.7178640365600586, + "num_tokens": 33485248.0, + "step": 1298 + }, + { + "epoch": 0.14265319569514606, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.250549077987671, + "learning_rate": 4.751098096632504e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.6869034171104431, + "num_tokens": 33516065.0, + "step": 1299 + }, + { + "epoch": 0.1427630133977597, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.042527437210083, + "learning_rate": 4.754758418740849e-07, + "loss": 1.0052, + "mean_token_accuracy": 0.7061914205551147, + "num_tokens": 33548790.0, + "step": 1300 + }, + { + "epoch": 0.14287283110037338, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.271029233932495, + "learning_rate": 4.758418740849194e-07, + "loss": 1.0754, + "mean_token_accuracy": 0.6873657703399658, + "num_tokens": 33577167.0, + "step": 1301 + }, + { + "epoch": 0.14298264880298703, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.549752712249756, + "learning_rate": 4.76207906295754e-07, + "loss": 1.0465, + "mean_token_accuracy": 0.6974196434020996, + "num_tokens": 33599933.0, + "step": 1302 + }, + { + "epoch": 0.1430924665056007, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.707270622253418, + "learning_rate": 4.7657393850658857e-07, + "loss": 0.9602, + "mean_token_accuracy": 0.7089191675186157, + "num_tokens": 33618648.0, + "step": 1303 + }, + { + "epoch": 0.14320228420821438, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 2.330418348312378, + "learning_rate": 4.769399707174231e-07, + "loss": 1.0322, + "mean_token_accuracy": 0.6929222345352173, + "num_tokens": 33643320.0, + "step": 1304 + }, + { + "epoch": 0.14331210191082802, + "ewc_loss": 
3.2335519790649414e-06, + "grad_norm": 2.4858293533325195, + "learning_rate": 4.773060029282577e-07, + "loss": 1.1176, + "mean_token_accuracy": 0.6688382029533386, + "num_tokens": 33667215.0, + "step": 1305 + }, + { + "epoch": 0.1434219196134417, + "ewc_loss": 3.2335519790649414e-06, + "grad_norm": 2.5017120838165283, + "learning_rate": 4.776720351390922e-07, + "loss": 1.1272, + "mean_token_accuracy": 0.6673105359077454, + "num_tokens": 33691314.0, + "step": 1306 + }, + { + "epoch": 0.14353173731605534, + "ewc_loss": 3.2335519790649414e-06, + "grad_norm": 2.513200283050537, + "learning_rate": 4.780380673499268e-07, + "loss": 1.0989, + "mean_token_accuracy": 0.6827288269996643, + "num_tokens": 33716231.0, + "step": 1307 + }, + { + "epoch": 0.14364155501866901, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.018425941467285, + "learning_rate": 4.784040995607613e-07, + "loss": 1.0744, + "mean_token_accuracy": 0.6807803511619568, + "num_tokens": 33750223.0, + "step": 1308 + }, + { + "epoch": 0.14375137272128266, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.55197811126709, + "learning_rate": 4.787701317715958e-07, + "loss": 1.0043, + "mean_token_accuracy": 0.7054019570350647, + "num_tokens": 33772320.0, + "step": 1309 + }, + { + "epoch": 0.14386119042389633, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.6128737926483154, + "learning_rate": 4.791361639824304e-07, + "loss": 1.0538, + "mean_token_accuracy": 0.6896075010299683, + "num_tokens": 33793342.0, + "step": 1310 + }, + { + "epoch": 0.14397100812650998, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.6582155227661133, + "learning_rate": 4.79502196193265e-07, + "loss": 0.9699, + "mean_token_accuracy": 0.7052695155143738, + "num_tokens": 33813186.0, + "step": 1311 + }, + { + "epoch": 0.14408082582912365, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.3576505184173584, + "learning_rate": 4.798682284040995e-07, + "loss": 1.0191, + "mean_token_accuracy": 0.6908333897590637, + 
"num_tokens": 33840214.0, + "step": 1312 + }, + { + "epoch": 0.14419064353173733, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.59765887260437, + "learning_rate": 4.802342606149341e-07, + "loss": 1.0369, + "mean_token_accuracy": 0.6895540952682495, + "num_tokens": 33863647.0, + "step": 1313 + }, + { + "epoch": 0.14430046123435097, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.066143751144409, + "learning_rate": 4.806002928257686e-07, + "loss": 1.1094, + "mean_token_accuracy": 0.6779706478118896, + "num_tokens": 33896827.0, + "step": 1314 + }, + { + "epoch": 0.14441027893696465, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.470606803894043, + "learning_rate": 4.809663250366032e-07, + "loss": 1.0702, + "mean_token_accuracy": 0.6803215742111206, + "num_tokens": 33920697.0, + "step": 1315 + }, + { + "epoch": 0.1445200966395783, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.3331899642944336, + "learning_rate": 4.813323572474377e-07, + "loss": 1.0273, + "mean_token_accuracy": 0.69908607006073, + "num_tokens": 33945998.0, + "step": 1316 + }, + { + "epoch": 0.14462991434219197, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.5178446769714355, + "learning_rate": 4.816983894582723e-07, + "loss": 1.0464, + "mean_token_accuracy": 0.6963666677474976, + "num_tokens": 33969471.0, + "step": 1317 + }, + { + "epoch": 0.1447397320448056, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.3434126377105713, + "learning_rate": 4.820644216691069e-07, + "loss": 1.08, + "mean_token_accuracy": 0.6784889101982117, + "num_tokens": 33995373.0, + "step": 1318 + }, + { + "epoch": 0.1448495497474193, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.662360429763794, + "learning_rate": 4.824304538799414e-07, + "loss": 0.9818, + "mean_token_accuracy": 0.7074779272079468, + "num_tokens": 34019470.0, + "step": 1319 + }, + { + "epoch": 0.14495936745003293, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.3687469959259033, + "learning_rate": 
4.827964860907759e-07, + "loss": 1.1017, + "mean_token_accuracy": 0.6777822971343994, + "num_tokens": 34046792.0, + "step": 1320 + }, + { + "epoch": 0.1450691851526466, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 2.3142096996307373, + "learning_rate": 4.831625183016105e-07, + "loss": 1.0794, + "mean_token_accuracy": 0.6796839237213135, + "num_tokens": 34073908.0, + "step": 1321 + }, + { + "epoch": 0.14517900285526028, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 2.1455183029174805, + "learning_rate": 4.835285505124451e-07, + "loss": 0.9408, + "mean_token_accuracy": 0.7145018577575684, + "num_tokens": 34101665.0, + "step": 1322 + }, + { + "epoch": 0.14528882055787393, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 2.4255943298339844, + "learning_rate": 4.838945827232796e-07, + "loss": 0.9672, + "mean_token_accuracy": 0.7097035050392151, + "num_tokens": 34123530.0, + "step": 1323 + }, + { + "epoch": 0.1453986382604876, + "ewc_loss": 3.293156623840332e-06, + "grad_norm": 2.4391884803771973, + "learning_rate": 4.842606149341142e-07, + "loss": 1.0598, + "mean_token_accuracy": 0.6763824224472046, + "num_tokens": 34148038.0, + "step": 1324 + }, + { + "epoch": 0.14550845596310125, + "ewc_loss": 3.293156623840332e-06, + "grad_norm": 2.4646997451782227, + "learning_rate": 4.846266471449487e-07, + "loss": 1.0639, + "mean_token_accuracy": 0.6817625761032104, + "num_tokens": 34171039.0, + "step": 1325 + }, + { + "epoch": 0.14561827366571492, + "ewc_loss": 3.293156623840332e-06, + "grad_norm": 2.3608295917510986, + "learning_rate": 4.849926793557833e-07, + "loss": 1.0671, + "mean_token_accuracy": 0.6823275089263916, + "num_tokens": 34197557.0, + "step": 1326 + }, + { + "epoch": 0.14572809136832857, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 2.499558210372925, + "learning_rate": 4.853587115666178e-07, + "loss": 1.006, + "mean_token_accuracy": 0.6961225271224976, + "num_tokens": 34219659.0, + "step": 1327 + }, + { + "epoch": 0.14583790907094224, + 
"ewc_loss": 3.3080577850341797e-06, + "grad_norm": 2.7252016067504883, + "learning_rate": 4.857247437774524e-07, + "loss": 1.0702, + "mean_token_accuracy": 0.68071049451828, + "num_tokens": 34241179.0, + "step": 1328 + }, + { + "epoch": 0.14594772677355589, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 2.6187050342559814, + "learning_rate": 4.86090775988287e-07, + "loss": 0.951, + "mean_token_accuracy": 0.715721845626831, + "num_tokens": 34261000.0, + "step": 1329 + }, + { + "epoch": 0.14605754447616956, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 2.5401611328125, + "learning_rate": 4.864568081991215e-07, + "loss": 1.0573, + "mean_token_accuracy": 0.6842934489250183, + "num_tokens": 34285817.0, + "step": 1330 + }, + { + "epoch": 0.14616736217878323, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 2.3174326419830322, + "learning_rate": 4.868228404099561e-07, + "loss": 1.1421, + "mean_token_accuracy": 0.6651726961135864, + "num_tokens": 34315537.0, + "step": 1331 + }, + { + "epoch": 0.14627717988139688, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.4091970920562744, + "learning_rate": 4.871888726207906e-07, + "loss": 1.065, + "mean_token_accuracy": 0.6844967603683472, + "num_tokens": 34338741.0, + "step": 1332 + }, + { + "epoch": 0.14638699758401055, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.4743545055389404, + "learning_rate": 4.875549048316251e-07, + "loss": 1.0453, + "mean_token_accuracy": 0.6915890574455261, + "num_tokens": 34362546.0, + "step": 1333 + }, + { + "epoch": 0.1464968152866242, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.0882041454315186, + "learning_rate": 4.879209370424597e-07, + "loss": 1.137, + "mean_token_accuracy": 0.6648823022842407, + "num_tokens": 34394885.0, + "step": 1334 + }, + { + "epoch": 0.14660663298923787, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.1149749755859375, + "learning_rate": 4.882869692532943e-07, + "loss": 1.0865, + "mean_token_accuracy": 0.6735713481903076, + 
"num_tokens": 34425299.0, + "step": 1335 + }, + { + "epoch": 0.14671645069185152, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.4397192001342773, + "learning_rate": 4.886530014641288e-07, + "loss": 1.0446, + "mean_token_accuracy": 0.6914026141166687, + "num_tokens": 34448680.0, + "step": 1336 + }, + { + "epoch": 0.1468262683944652, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.3290939331054688, + "learning_rate": 4.890190336749634e-07, + "loss": 1.0939, + "mean_token_accuracy": 0.6814024448394775, + "num_tokens": 34474831.0, + "step": 1337 + }, + { + "epoch": 0.14693608609707884, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.399338960647583, + "learning_rate": 4.893850658857979e-07, + "loss": 0.927, + "mean_token_accuracy": 0.7197632789611816, + "num_tokens": 34496472.0, + "step": 1338 + }, + { + "epoch": 0.1470459037996925, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 2.513505220413208, + "learning_rate": 4.897510980966325e-07, + "loss": 0.9959, + "mean_token_accuracy": 0.713485062122345, + "num_tokens": 34519372.0, + "step": 1339 + }, + { + "epoch": 0.14715572150230616, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.3493359088897705, + "learning_rate": 4.90117130307467e-07, + "loss": 1.0888, + "mean_token_accuracy": 0.6913020610809326, + "num_tokens": 34545661.0, + "step": 1340 + }, + { + "epoch": 0.14726553920491983, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.2264764308929443, + "learning_rate": 4.904831625183016e-07, + "loss": 1.172, + "mean_token_accuracy": 0.6626344323158264, + "num_tokens": 34576126.0, + "step": 1341 + }, + { + "epoch": 0.1473753569075335, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.5943422317504883, + "learning_rate": 4.908491947291362e-07, + "loss": 1.0932, + "mean_token_accuracy": 0.680105984210968, + "num_tokens": 34600086.0, + "step": 1342 + }, + { + "epoch": 0.14748517461014715, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.338102340698242, + "learning_rate": 
4.912152269399707e-07, + "loss": 1.0049, + "mean_token_accuracy": 0.7047562003135681, + "num_tokens": 34626401.0, + "step": 1343 + }, + { + "epoch": 0.14759499231276083, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.3874800205230713, + "learning_rate": 4.915812591508052e-07, + "loss": 1.1012, + "mean_token_accuracy": 0.6757742166519165, + "num_tokens": 34653201.0, + "step": 1344 + }, + { + "epoch": 0.14770481001537447, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.3318874835968018, + "learning_rate": 4.919472913616398e-07, + "loss": 1.0833, + "mean_token_accuracy": 0.6785346865653992, + "num_tokens": 34680307.0, + "step": 1345 + }, + { + "epoch": 0.14781462771798815, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.547093629837036, + "learning_rate": 4.923133235724744e-07, + "loss": 1.0399, + "mean_token_accuracy": 0.6957828998565674, + "num_tokens": 34702531.0, + "step": 1346 + }, + { + "epoch": 0.1479244454206018, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.125781297683716, + "learning_rate": 4.926793557833089e-07, + "loss": 1.0692, + "mean_token_accuracy": 0.6838468909263611, + "num_tokens": 34733708.0, + "step": 1347 + }, + { + "epoch": 0.14803426312321546, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.4788105487823486, + "learning_rate": 4.930453879941435e-07, + "loss": 1.0993, + "mean_token_accuracy": 0.6769970059394836, + "num_tokens": 34758625.0, + "step": 1348 + }, + { + "epoch": 0.1481440808258291, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.260887622833252, + "learning_rate": 4.93411420204978e-07, + "loss": 1.0636, + "mean_token_accuracy": 0.6941145062446594, + "num_tokens": 34786952.0, + "step": 1349 + }, + { + "epoch": 0.14825389852844278, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.316326856613159, + "learning_rate": 4.937774524158126e-07, + "loss": 1.124, + "mean_token_accuracy": 0.6717540621757507, + "num_tokens": 34813181.0, + "step": 1350 + }, + { + "epoch": 0.14836371623105646, + 
"ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.197755813598633, + "learning_rate": 4.941434846266471e-07, + "loss": 1.0184, + "mean_token_accuracy": 0.6942656636238098, + "num_tokens": 34845742.0, + "step": 1351 + }, + { + "epoch": 0.1484735339336701, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.2643351554870605, + "learning_rate": 4.945095168374817e-07, + "loss": 1.0636, + "mean_token_accuracy": 0.6843202114105225, + "num_tokens": 34872515.0, + "step": 1352 + }, + { + "epoch": 0.14858335163628378, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.396113157272339, + "learning_rate": 4.948755490483163e-07, + "loss": 1.1201, + "mean_token_accuracy": 0.6676150560379028, + "num_tokens": 34899778.0, + "step": 1353 + }, + { + "epoch": 0.14869316933889742, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.309542417526245, + "learning_rate": 4.952415812591508e-07, + "loss": 1.0042, + "mean_token_accuracy": 0.6940574645996094, + "num_tokens": 34926103.0, + "step": 1354 + }, + { + "epoch": 0.1488029870415111, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.2097175121307373, + "learning_rate": 4.956076134699854e-07, + "loss": 0.9741, + "mean_token_accuracy": 0.7065348625183105, + "num_tokens": 34952471.0, + "step": 1355 + }, + { + "epoch": 0.14891280474412474, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 2.145569086074829, + "learning_rate": 4.959736456808199e-07, + "loss": 1.1032, + "mean_token_accuracy": 0.6704615354537964, + "num_tokens": 34981653.0, + "step": 1356 + }, + { + "epoch": 0.14902262244673842, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.514423131942749, + "learning_rate": 4.963396778916544e-07, + "loss": 0.9884, + "mean_token_accuracy": 0.7093942761421204, + "num_tokens": 35004160.0, + "step": 1357 + }, + { + "epoch": 0.14913244014935206, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.219200372695923, + "learning_rate": 4.96705710102489e-07, + "loss": 1.0849, + "mean_token_accuracy": 
0.6846756935119629, + "num_tokens": 35033695.0, + "step": 1358 + }, + { + "epoch": 0.14924225785196574, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.346745729446411, + "learning_rate": 4.970717423133236e-07, + "loss": 1.0161, + "mean_token_accuracy": 0.6910429000854492, + "num_tokens": 35058960.0, + "step": 1359 + }, + { + "epoch": 0.1493520755545794, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.3643317222595215, + "learning_rate": 4.974377745241581e-07, + "loss": 1.0379, + "mean_token_accuracy": 0.6925081014633179, + "num_tokens": 35082458.0, + "step": 1360 + }, + { + "epoch": 0.14946189325719306, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.2195937633514404, + "learning_rate": 4.978038067349927e-07, + "loss": 1.0791, + "mean_token_accuracy": 0.690503716468811, + "num_tokens": 35111320.0, + "step": 1361 + }, + { + "epoch": 0.14957171095980673, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.5440080165863037, + "learning_rate": 4.981698389458272e-07, + "loss": 1.1067, + "mean_token_accuracy": 0.668626070022583, + "num_tokens": 35136034.0, + "step": 1362 + }, + { + "epoch": 0.14968152866242038, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.4478659629821777, + "learning_rate": 4.985358711566618e-07, + "loss": 1.05, + "mean_token_accuracy": 0.690972089767456, + "num_tokens": 35160498.0, + "step": 1363 + }, + { + "epoch": 0.14979134636503405, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.1334452629089355, + "learning_rate": 4.989019033674963e-07, + "loss": 1.0874, + "mean_token_accuracy": 0.6837239265441895, + "num_tokens": 35191074.0, + "step": 1364 + }, + { + "epoch": 0.1499011640676477, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 2.3900907039642334, + "learning_rate": 4.992679355783309e-07, + "loss": 1.0587, + "mean_token_accuracy": 0.6913729906082153, + "num_tokens": 35216079.0, + "step": 1365 + }, + { + "epoch": 0.15001098177026137, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 
2.5205252170562744, + "learning_rate": 4.996339677891655e-07, + "loss": 1.0622, + "mean_token_accuracy": 0.6931835412979126, + "num_tokens": 35239149.0, + "step": 1366 + }, + { + "epoch": 0.15012079947287502, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.267179250717163, + "learning_rate": 5e-07, + "loss": 1.1108, + "mean_token_accuracy": 0.6792996525764465, + "num_tokens": 35267266.0, + "step": 1367 + }, + { + "epoch": 0.1502306171754887, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.2695391178131104, + "learning_rate": 5.003660322108345e-07, + "loss": 1.1178, + "mean_token_accuracy": 0.6748911738395691, + "num_tokens": 35295774.0, + "step": 1368 + }, + { + "epoch": 0.15034043487810236, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.734623670578003, + "learning_rate": 5.007320644216691e-07, + "loss": 1.0458, + "mean_token_accuracy": 0.6878563165664673, + "num_tokens": 35315866.0, + "step": 1369 + }, + { + "epoch": 0.150450252580716, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.4189651012420654, + "learning_rate": 5.010980966325037e-07, + "loss": 1.08, + "mean_token_accuracy": 0.6869282722473145, + "num_tokens": 35340461.0, + "step": 1370 + }, + { + "epoch": 0.15056007028332968, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.158029079437256, + "learning_rate": 5.014641288433382e-07, + "loss": 0.9908, + "mean_token_accuracy": 0.7031317353248596, + "num_tokens": 35367551.0, + "step": 1371 + }, + { + "epoch": 0.15066988798594333, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.2132554054260254, + "learning_rate": 5.018301610541728e-07, + "loss": 1.1208, + "mean_token_accuracy": 0.6655483841896057, + "num_tokens": 35397737.0, + "step": 1372 + }, + { + "epoch": 0.150779705688557, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.5164685249328613, + "learning_rate": 5.021961932650073e-07, + "loss": 1.0369, + "mean_token_accuracy": 0.6938371658325195, + "num_tokens": 35422047.0, + "step": 1373 + }, + { + "epoch": 
0.15088952339117065, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.256465435028076, + "learning_rate": 5.025622254758418e-07, + "loss": 1.0583, + "mean_token_accuracy": 0.6879005432128906, + "num_tokens": 35450974.0, + "step": 1374 + }, + { + "epoch": 0.15099934109378432, + "ewc_loss": 3.382563591003418e-06, + "grad_norm": 2.277759075164795, + "learning_rate": 5.029282576866764e-07, + "loss": 0.9939, + "mean_token_accuracy": 0.7044919729232788, + "num_tokens": 35478196.0, + "step": 1375 + }, + { + "epoch": 0.15110915879639797, + "ewc_loss": 3.4123659133911133e-06, + "grad_norm": 2.161494016647339, + "learning_rate": 5.03294289897511e-07, + "loss": 1.1224, + "mean_token_accuracy": 0.6688538193702698, + "num_tokens": 35510077.0, + "step": 1376 + }, + { + "epoch": 0.15121897649901164, + "ewc_loss": 3.4123659133911133e-06, + "grad_norm": 2.361820936203003, + "learning_rate": 5.036603221083456e-07, + "loss": 1.0706, + "mean_token_accuracy": 0.6847456693649292, + "num_tokens": 35538238.0, + "step": 1377 + }, + { + "epoch": 0.1513287942016253, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.2089321613311768, + "learning_rate": 5.040263543191801e-07, + "loss": 1.1177, + "mean_token_accuracy": 0.6682100296020508, + "num_tokens": 35569282.0, + "step": 1378 + }, + { + "epoch": 0.15143861190423896, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.4204812049865723, + "learning_rate": 5.043923865300146e-07, + "loss": 1.1601, + "mean_token_accuracy": 0.671600341796875, + "num_tokens": 35595162.0, + "step": 1379 + }, + { + "epoch": 0.15154842960685264, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 2.413783311843872, + "learning_rate": 5.047584187408492e-07, + "loss": 1.0022, + "mean_token_accuracy": 0.6976226568222046, + "num_tokens": 35620173.0, + "step": 1380 + }, + { + "epoch": 0.15165824730946628, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 2.759983777999878, + "learning_rate": 5.051244509516838e-07, + "loss": 0.9932, + "mean_token_accuracy": 
0.7044582366943359, + "num_tokens": 35641699.0, + "step": 1381 + }, + { + "epoch": 0.15176806501207996, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.258857250213623, + "learning_rate": 5.054904831625183e-07, + "loss": 1.1021, + "mean_token_accuracy": 0.6736060380935669, + "num_tokens": 35672048.0, + "step": 1382 + }, + { + "epoch": 0.1518778827146936, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.425708532333374, + "learning_rate": 5.058565153733529e-07, + "loss": 1.0978, + "mean_token_accuracy": 0.6775069236755371, + "num_tokens": 35697483.0, + "step": 1383 + }, + { + "epoch": 0.15198770041730728, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.361309766769409, + "learning_rate": 5.062225475841874e-07, + "loss": 1.0671, + "mean_token_accuracy": 0.6965068578720093, + "num_tokens": 35723408.0, + "step": 1384 + }, + { + "epoch": 0.15209751811992092, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.375441789627075, + "learning_rate": 5.065885797950219e-07, + "loss": 1.0375, + "mean_token_accuracy": 0.6920702457427979, + "num_tokens": 35747189.0, + "step": 1385 + }, + { + "epoch": 0.1522073358225346, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 2.2212674617767334, + "learning_rate": 5.069546120058566e-07, + "loss": 1.0669, + "mean_token_accuracy": 0.6823331713676453, + "num_tokens": 35777084.0, + "step": 1386 + }, + { + "epoch": 0.15231715352514824, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.403362274169922, + "learning_rate": 5.073206442166911e-07, + "loss": 0.9803, + "mean_token_accuracy": 0.7079250812530518, + "num_tokens": 35801318.0, + "step": 1387 + }, + { + "epoch": 0.15242697122776191, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.8376035690307617, + "learning_rate": 5.076866764275256e-07, + "loss": 1.0, + "mean_token_accuracy": 0.7028485536575317, + "num_tokens": 35820852.0, + "step": 1388 + }, + { + "epoch": 0.1525367889303756, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.395885944366455, 
+ "learning_rate": 5.080527086383602e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6764516830444336, + "num_tokens": 35846653.0, + "step": 1389 + }, + { + "epoch": 0.15264660663298923, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 2.1434226036071777, + "learning_rate": 5.084187408491947e-07, + "loss": 1.0963, + "mean_token_accuracy": 0.675629734992981, + "num_tokens": 35877674.0, + "step": 1390 + }, + { + "epoch": 0.1527564243356029, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 2.143498182296753, + "learning_rate": 5.087847730600292e-07, + "loss": 1.0889, + "mean_token_accuracy": 0.6802003383636475, + "num_tokens": 35908450.0, + "step": 1391 + }, + { + "epoch": 0.15286624203821655, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 2.385495662689209, + "learning_rate": 5.091508052708639e-07, + "loss": 1.0528, + "mean_token_accuracy": 0.6922053098678589, + "num_tokens": 35934858.0, + "step": 1392 + }, + { + "epoch": 0.15297605974083023, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 2.3456203937530518, + "learning_rate": 5.095168374816984e-07, + "loss": 1.0564, + "mean_token_accuracy": 0.6854041814804077, + "num_tokens": 35962276.0, + "step": 1393 + }, + { + "epoch": 0.15308587744344387, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 2.610382556915283, + "learning_rate": 5.09882869692533e-07, + "loss": 1.1808, + "mean_token_accuracy": 0.66227126121521, + "num_tokens": 35987873.0, + "step": 1394 + }, + { + "epoch": 0.15319569514605755, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 2.4043335914611816, + "learning_rate": 5.102489019033675e-07, + "loss": 1.0543, + "mean_token_accuracy": 0.6861127614974976, + "num_tokens": 36013354.0, + "step": 1395 + }, + { + "epoch": 0.1533055128486712, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 2.2554595470428467, + "learning_rate": 5.10614934114202e-07, + "loss": 1.0712, + "mean_token_accuracy": 0.6820650100708008, + "num_tokens": 36040211.0, + "step": 1396 + }, + { + "epoch": 
0.15341533055128487, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 2.3388123512268066, + "learning_rate": 5.109809663250367e-07, + "loss": 1.0989, + "mean_token_accuracy": 0.6762032508850098, + "num_tokens": 36066374.0, + "step": 1397 + }, + { + "epoch": 0.15352514825389854, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 2.405137777328491, + "learning_rate": 5.113469985358712e-07, + "loss": 0.9978, + "mean_token_accuracy": 0.6995058059692383, + "num_tokens": 36092114.0, + "step": 1398 + }, + { + "epoch": 0.1536349659565122, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 2.6273951530456543, + "learning_rate": 5.117130307467057e-07, + "loss": 1.0609, + "mean_token_accuracy": 0.6890732645988464, + "num_tokens": 36117150.0, + "step": 1399 + }, + { + "epoch": 0.15374478365912586, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 2.2091190814971924, + "learning_rate": 5.120790629575403e-07, + "loss": 1.1538, + "mean_token_accuracy": 0.666439950466156, + "num_tokens": 36146676.0, + "step": 1400 + }, + { + "epoch": 0.1538546013617395, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 2.3724889755249023, + "learning_rate": 5.124450951683748e-07, + "loss": 1.1084, + "mean_token_accuracy": 0.6725355982780457, + "num_tokens": 36173244.0, + "step": 1401 + }, + { + "epoch": 0.15396441906435318, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 2.214016914367676, + "learning_rate": 5.128111273792094e-07, + "loss": 1.1103, + "mean_token_accuracy": 0.677545428276062, + "num_tokens": 36203197.0, + "step": 1402 + }, + { + "epoch": 0.15407423676696683, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 2.338026762008667, + "learning_rate": 5.13177159590044e-07, + "loss": 1.1112, + "mean_token_accuracy": 0.6727278828620911, + "num_tokens": 36231276.0, + "step": 1403 + }, + { + "epoch": 0.1541840544695805, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 2.1658668518066406, + "learning_rate": 5.135431918008785e-07, + "loss": 1.0758, + 
"mean_token_accuracy": 0.6905209422111511, + "num_tokens": 36261923.0, + "step": 1404 + }, + { + "epoch": 0.15429387217219415, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 2.4478445053100586, + "learning_rate": 5.13909224011713e-07, + "loss": 0.99, + "mean_token_accuracy": 0.710797131061554, + "num_tokens": 36285200.0, + "step": 1405 + }, + { + "epoch": 0.15440368987480782, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 2.342209577560425, + "learning_rate": 5.142752562225476e-07, + "loss": 1.0774, + "mean_token_accuracy": 0.6803367137908936, + "num_tokens": 36310732.0, + "step": 1406 + }, + { + "epoch": 0.1545135075774215, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 2.5287320613861084, + "learning_rate": 5.14641288433382e-07, + "loss": 1.0053, + "mean_token_accuracy": 0.7005858421325684, + "num_tokens": 36333833.0, + "step": 1407 + }, + { + "epoch": 0.15462332528003514, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.4627676010131836, + "learning_rate": 5.150073206442168e-07, + "loss": 1.0154, + "mean_token_accuracy": 0.7042307257652283, + "num_tokens": 36357548.0, + "step": 1408 + }, + { + "epoch": 0.1547331429826488, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.224602222442627, + "learning_rate": 5.153733528550513e-07, + "loss": 0.9734, + "mean_token_accuracy": 0.7066992521286011, + "num_tokens": 36386854.0, + "step": 1409 + }, + { + "epoch": 0.15484296068526246, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.0661935806274414, + "learning_rate": 5.157393850658857e-07, + "loss": 1.168, + "mean_token_accuracy": 0.6616121530532837, + "num_tokens": 36425311.0, + "step": 1410 + }, + { + "epoch": 0.15495277838787613, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.125882148742676, + "learning_rate": 5.161054172767203e-07, + "loss": 0.9837, + "mean_token_accuracy": 0.702193558216095, + "num_tokens": 36453543.0, + "step": 1411 + }, + { + "epoch": 0.15506259609048978, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 
2.431307554244995, + "learning_rate": 5.164714494875548e-07, + "loss": 1.0936, + "mean_token_accuracy": 0.6877700686454773, + "num_tokens": 36479897.0, + "step": 1412 + }, + { + "epoch": 0.15517241379310345, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.398103952407837, + "learning_rate": 5.168374816983894e-07, + "loss": 1.0868, + "mean_token_accuracy": 0.6816451549530029, + "num_tokens": 36508596.0, + "step": 1413 + }, + { + "epoch": 0.1552822314957171, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.5406737327575684, + "learning_rate": 5.17203513909224e-07, + "loss": 1.0826, + "mean_token_accuracy": 0.6781405806541443, + "num_tokens": 36532995.0, + "step": 1414 + }, + { + "epoch": 0.15539204919833077, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.1425516605377197, + "learning_rate": 5.175695461200585e-07, + "loss": 1.0914, + "mean_token_accuracy": 0.6790496110916138, + "num_tokens": 36565082.0, + "step": 1415 + }, + { + "epoch": 0.15550186690094442, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 2.3426718711853027, + "learning_rate": 5.17935578330893e-07, + "loss": 1.1697, + "mean_token_accuracy": 0.655335009098053, + "num_tokens": 36595531.0, + "step": 1416 + }, + { + "epoch": 0.1556116846035581, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.470357894897461, + "learning_rate": 5.183016105417276e-07, + "loss": 1.0242, + "mean_token_accuracy": 0.6984805464744568, + "num_tokens": 36618337.0, + "step": 1417 + }, + { + "epoch": 0.15572150230617177, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.60383939743042, + "learning_rate": 5.186676427525622e-07, + "loss": 1.1213, + "mean_token_accuracy": 0.6685613393783569, + "num_tokens": 36641123.0, + "step": 1418 + }, + { + "epoch": 0.1558313200087854, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.38930606842041, + "learning_rate": 5.190336749633967e-07, + "loss": 1.0685, + "mean_token_accuracy": 0.6904737949371338, + "num_tokens": 36667870.0, + "step": 1419 + }, + { + 
"epoch": 0.15594113771139909, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.273273468017578, + "learning_rate": 5.193997071742313e-07, + "loss": 1.0561, + "mean_token_accuracy": 0.6897518038749695, + "num_tokens": 36697458.0, + "step": 1420 + }, + { + "epoch": 0.15605095541401273, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.1989660263061523, + "learning_rate": 5.197657393850658e-07, + "loss": 1.1096, + "mean_token_accuracy": 0.6777681708335876, + "num_tokens": 36728999.0, + "step": 1421 + }, + { + "epoch": 0.1561607731166264, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.848385810852051, + "learning_rate": 5.201317715959003e-07, + "loss": 1.1467, + "mean_token_accuracy": 0.6672494411468506, + "num_tokens": 36751360.0, + "step": 1422 + }, + { + "epoch": 0.15627059081924005, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.3902809619903564, + "learning_rate": 5.204978038067349e-07, + "loss": 1.1045, + "mean_token_accuracy": 0.6809993982315063, + "num_tokens": 36778609.0, + "step": 1423 + }, + { + "epoch": 0.15638040852185373, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.485682725906372, + "learning_rate": 5.208638360175695e-07, + "loss": 1.0263, + "mean_token_accuracy": 0.6926228404045105, + "num_tokens": 36801124.0, + "step": 1424 + }, + { + "epoch": 0.15649022622446737, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 2.4961647987365723, + "learning_rate": 5.212298682284041e-07, + "loss": 1.106, + "mean_token_accuracy": 0.6719284057617188, + "num_tokens": 36826030.0, + "step": 1425 + }, + { + "epoch": 0.15660004392708105, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.248225212097168, + "learning_rate": 5.215959004392386e-07, + "loss": 1.1306, + "mean_token_accuracy": 0.6696171760559082, + "num_tokens": 36857373.0, + "step": 1426 + }, + { + "epoch": 0.15670986162969472, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.4038357734680176, + "learning_rate": 5.219619326500731e-07, + "loss": 1.0314, + 
"mean_token_accuracy": 0.6962092518806458, + "num_tokens": 36882743.0, + "step": 1427 + }, + { + "epoch": 0.15681967933230836, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.3254804611206055, + "learning_rate": 5.223279648609077e-07, + "loss": 1.045, + "mean_token_accuracy": 0.689921498298645, + "num_tokens": 36908317.0, + "step": 1428 + }, + { + "epoch": 0.15692949703492204, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.5264153480529785, + "learning_rate": 5.226939970717423e-07, + "loss": 1.0774, + "mean_token_accuracy": 0.6793670654296875, + "num_tokens": 36933934.0, + "step": 1429 + }, + { + "epoch": 0.15703931473753568, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.3912410736083984, + "learning_rate": 5.230600292825768e-07, + "loss": 1.0154, + "mean_token_accuracy": 0.6967182755470276, + "num_tokens": 36961658.0, + "step": 1430 + }, + { + "epoch": 0.15714913244014936, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 4.1317524909973145, + "learning_rate": 5.234260614934114e-07, + "loss": 1.0444, + "mean_token_accuracy": 0.6846979856491089, + "num_tokens": 36989890.0, + "step": 1431 + }, + { + "epoch": 0.157258950142763, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.6179540157318115, + "learning_rate": 5.237920937042459e-07, + "loss": 1.1369, + "mean_token_accuracy": 0.6706365346908569, + "num_tokens": 37014359.0, + "step": 1432 + }, + { + "epoch": 0.15736876784537668, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.545365810394287, + "learning_rate": 5.241581259150804e-07, + "loss": 1.0429, + "mean_token_accuracy": 0.6923038959503174, + "num_tokens": 37036466.0, + "step": 1433 + }, + { + "epoch": 0.15747858554799032, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.691986083984375, + "learning_rate": 5.245241581259151e-07, + "loss": 0.9896, + "mean_token_accuracy": 0.7052090167999268, + "num_tokens": 37056557.0, + "step": 1434 + }, + { + "epoch": 0.157588403250604, + "ewc_loss": 3.56137752532959e-06, + 
"grad_norm": 2.4878554344177246, + "learning_rate": 5.248901903367496e-07, + "loss": 1.1046, + "mean_token_accuracy": 0.6724478006362915, + "num_tokens": 37079546.0, + "step": 1435 + }, + { + "epoch": 0.15769822095321767, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.591970920562744, + "learning_rate": 5.252562225475841e-07, + "loss": 1.131, + "mean_token_accuracy": 0.6696991920471191, + "num_tokens": 37103143.0, + "step": 1436 + }, + { + "epoch": 0.15780803865583132, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.382094383239746, + "learning_rate": 5.256222547584187e-07, + "loss": 0.9959, + "mean_token_accuracy": 0.7200796604156494, + "num_tokens": 37128958.0, + "step": 1437 + }, + { + "epoch": 0.157917856358445, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.697359323501587, + "learning_rate": 5.259882869692532e-07, + "loss": 0.9899, + "mean_token_accuracy": 0.7067126631736755, + "num_tokens": 37148862.0, + "step": 1438 + }, + { + "epoch": 0.15802767406105864, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 2.6961545944213867, + "learning_rate": 5.263543191800877e-07, + "loss": 0.9493, + "mean_token_accuracy": 0.7122265100479126, + "num_tokens": 37168500.0, + "step": 1439 + }, + { + "epoch": 0.1581374917636723, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.5838046073913574, + "learning_rate": 5.267203513909224e-07, + "loss": 1.0742, + "mean_token_accuracy": 0.6798703670501709, + "num_tokens": 37191202.0, + "step": 1440 + }, + { + "epoch": 0.15824730946628596, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.4913666248321533, + "learning_rate": 5.270863836017569e-07, + "loss": 1.1302, + "mean_token_accuracy": 0.6630654335021973, + "num_tokens": 37215939.0, + "step": 1441 + }, + { + "epoch": 0.15835712716889963, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.2521584033966064, + "learning_rate": 5.274524158125915e-07, + "loss": 1.2476, + "mean_token_accuracy": 0.6606035232543945, + "num_tokens": 37246502.0, + "step": 
1442 + }, + { + "epoch": 0.15846694487151328, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.496626377105713, + "learning_rate": 5.27818448023426e-07, + "loss": 0.953, + "mean_token_accuracy": 0.7143974304199219, + "num_tokens": 37269519.0, + "step": 1443 + }, + { + "epoch": 0.15857676257412695, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.2488269805908203, + "learning_rate": 5.281844802342605e-07, + "loss": 1.0219, + "mean_token_accuracy": 0.6999832391738892, + "num_tokens": 37295472.0, + "step": 1444 + }, + { + "epoch": 0.15868658027674062, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.3422963619232178, + "learning_rate": 5.285505124450952e-07, + "loss": 1.1014, + "mean_token_accuracy": 0.678535521030426, + "num_tokens": 37323089.0, + "step": 1445 + }, + { + "epoch": 0.15879639797935427, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.5792980194091797, + "learning_rate": 5.289165446559297e-07, + "loss": 1.109, + "mean_token_accuracy": 0.6697349548339844, + "num_tokens": 37349083.0, + "step": 1446 + }, + { + "epoch": 0.15890621568196794, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.6141018867492676, + "learning_rate": 5.292825768667642e-07, + "loss": 0.9237, + "mean_token_accuracy": 0.7187613248825073, + "num_tokens": 37370736.0, + "step": 1447 + }, + { + "epoch": 0.1590160333845816, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.4885354042053223, + "learning_rate": 5.296486090775988e-07, + "loss": 1.085, + "mean_token_accuracy": 0.6806784272193909, + "num_tokens": 37395554.0, + "step": 1448 + }, + { + "epoch": 0.15912585108719526, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 2.489989995956421, + "learning_rate": 5.300146412884333e-07, + "loss": 1.0509, + "mean_token_accuracy": 0.6873341798782349, + "num_tokens": 37418863.0, + "step": 1449 + }, + { + "epoch": 0.1592356687898089, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 3.0221574306488037, + "learning_rate": 5.303806734992679e-07, + "loss": 
1.0422, + "mean_token_accuracy": 0.6833710670471191, + "num_tokens": 37435803.0, + "step": 1450 + }, + { + "epoch": 0.15934548649242258, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 2.395343542098999, + "learning_rate": 5.307467057101025e-07, + "loss": 1.1595, + "mean_token_accuracy": 0.657595157623291, + "num_tokens": 37465254.0, + "step": 1451 + }, + { + "epoch": 0.15945530419503623, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 2.3393609523773193, + "learning_rate": 5.31112737920937e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.7069559097290039, + "num_tokens": 37490392.0, + "step": 1452 + }, + { + "epoch": 0.1595651218976499, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 2.362633466720581, + "learning_rate": 5.314787701317715e-07, + "loss": 1.054, + "mean_token_accuracy": 0.6858939528465271, + "num_tokens": 37515868.0, + "step": 1453 + }, + { + "epoch": 0.15967493960026355, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 2.4846584796905518, + "learning_rate": 5.318448023426061e-07, + "loss": 0.9995, + "mean_token_accuracy": 0.7036100625991821, + "num_tokens": 37539607.0, + "step": 1454 + }, + { + "epoch": 0.15978475730287722, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 2.941898822784424, + "learning_rate": 5.322108345534406e-07, + "loss": 0.9313, + "mean_token_accuracy": 0.7162513136863708, + "num_tokens": 37557033.0, + "step": 1455 + }, + { + "epoch": 0.1598945750054909, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 2.8975133895874023, + "learning_rate": 5.325768667642752e-07, + "loss": 0.9889, + "mean_token_accuracy": 0.6999136209487915, + "num_tokens": 37581118.0, + "step": 1456 + }, + { + "epoch": 0.16000439270810454, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 2.412320613861084, + "learning_rate": 5.329428989751098e-07, + "loss": 1.141, + "mean_token_accuracy": 0.6671912670135498, + "num_tokens": 37609075.0, + "step": 1457 + }, + { + "epoch": 0.16011421041071822, + "ewc_loss": 3.635883331298828e-06, + 
"grad_norm": 2.268573045730591, + "learning_rate": 5.333089311859443e-07, + "loss": 1.0501, + "mean_token_accuracy": 0.6896424293518066, + "num_tokens": 37636878.0, + "step": 1458 + }, + { + "epoch": 0.16022402811333186, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 2.298672676086426, + "learning_rate": 5.336749633967789e-07, + "loss": 1.1086, + "mean_token_accuracy": 0.6675886511802673, + "num_tokens": 37667267.0, + "step": 1459 + }, + { + "epoch": 0.16033384581594554, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.712271213531494, + "learning_rate": 5.340409956076134e-07, + "loss": 1.0168, + "mean_token_accuracy": 0.7000830173492432, + "num_tokens": 37688240.0, + "step": 1460 + }, + { + "epoch": 0.16044366351855918, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 2.6407155990600586, + "learning_rate": 5.34407027818448e-07, + "loss": 0.9376, + "mean_token_accuracy": 0.7132566571235657, + "num_tokens": 37707416.0, + "step": 1461 + }, + { + "epoch": 0.16055348122117286, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.093210220336914, + "learning_rate": 5.347730600292826e-07, + "loss": 1.087, + "mean_token_accuracy": 0.6833791732788086, + "num_tokens": 37740039.0, + "step": 1462 + }, + { + "epoch": 0.1606632989237865, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.1227121353149414, + "learning_rate": 5.351390922401171e-07, + "loss": 1.1451, + "mean_token_accuracy": 0.6688340902328491, + "num_tokens": 37773476.0, + "step": 1463 + }, + { + "epoch": 0.16077311662640018, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.295560598373413, + "learning_rate": 5.355051244509516e-07, + "loss": 1.0674, + "mean_token_accuracy": 0.6900171041488647, + "num_tokens": 37802625.0, + "step": 1464 + }, + { + "epoch": 0.16088293432901385, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.848593235015869, + "learning_rate": 5.358711566617862e-07, + "loss": 0.9715, + "mean_token_accuracy": 0.7086926698684692, + "num_tokens": 37822336.0, + "step": 
1465 + }, + { + "epoch": 0.1609927520316275, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.4239344596862793, + "learning_rate": 5.362371888726208e-07, + "loss": 1.0066, + "mean_token_accuracy": 0.6978536248207092, + "num_tokens": 37845801.0, + "step": 1466 + }, + { + "epoch": 0.16110256973424117, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.4431400299072266, + "learning_rate": 5.366032210834553e-07, + "loss": 1.1174, + "mean_token_accuracy": 0.676944375038147, + "num_tokens": 37872498.0, + "step": 1467 + }, + { + "epoch": 0.16121238743685481, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.092496871948242, + "learning_rate": 5.369692532942899e-07, + "loss": 1.0578, + "mean_token_accuracy": 0.686896800994873, + "num_tokens": 37904152.0, + "step": 1468 + }, + { + "epoch": 0.1613222051394685, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.2124476432800293, + "learning_rate": 5.373352855051244e-07, + "loss": 0.9837, + "mean_token_accuracy": 0.7111308574676514, + "num_tokens": 37931150.0, + "step": 1469 + }, + { + "epoch": 0.16143202284208213, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 2.440429210662842, + "learning_rate": 5.377013177159589e-07, + "loss": 1.0582, + "mean_token_accuracy": 0.6838119029998779, + "num_tokens": 37955011.0, + "step": 1470 + }, + { + "epoch": 0.1615418405446958, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.4625065326690674, + "learning_rate": 5.380673499267935e-07, + "loss": 1.124, + "mean_token_accuracy": 0.6712194681167603, + "num_tokens": 37981518.0, + "step": 1471 + }, + { + "epoch": 0.16165165824730945, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 2.376608371734619, + "learning_rate": 5.384333821376281e-07, + "loss": 1.091, + "mean_token_accuracy": 0.6811525821685791, + "num_tokens": 38008938.0, + "step": 1472 + }, + { + "epoch": 0.16176147594992313, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 2.3925132751464844, + "learning_rate": 5.387994143484626e-07, + "loss": 
1.0485, + "mean_token_accuracy": 0.6861202716827393, + "num_tokens": 38036626.0, + "step": 1473 + }, + { + "epoch": 0.1618712936525368, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 2.077587604522705, + "learning_rate": 5.391654465592972e-07, + "loss": 1.1065, + "mean_token_accuracy": 0.6734978556632996, + "num_tokens": 38070141.0, + "step": 1474 + }, + { + "epoch": 0.16198111135515045, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 2.704803228378296, + "learning_rate": 5.395314787701317e-07, + "loss": 1.0236, + "mean_token_accuracy": 0.6931037306785583, + "num_tokens": 38092134.0, + "step": 1475 + }, + { + "epoch": 0.16209092905776412, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 2.826103448867798, + "learning_rate": 5.398975109809663e-07, + "loss": 0.9876, + "mean_token_accuracy": 0.7004561424255371, + "num_tokens": 38111775.0, + "step": 1476 + }, + { + "epoch": 0.16220074676037777, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 2.7133524417877197, + "learning_rate": 5.402635431918009e-07, + "loss": 1.0978, + "mean_token_accuracy": 0.676531970500946, + "num_tokens": 38134366.0, + "step": 1477 + }, + { + "epoch": 0.16231056446299144, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 2.4209177494049072, + "learning_rate": 5.406295754026354e-07, + "loss": 1.0871, + "mean_token_accuracy": 0.676588237285614, + "num_tokens": 38159731.0, + "step": 1478 + }, + { + "epoch": 0.1624203821656051, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 2.31461763381958, + "learning_rate": 5.4099560761347e-07, + "loss": 1.0645, + "mean_token_accuracy": 0.6802313923835754, + "num_tokens": 38189414.0, + "step": 1479 + }, + { + "epoch": 0.16253019986821876, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 2.4426188468933105, + "learning_rate": 5.413616398243045e-07, + "loss": 1.0325, + "mean_token_accuracy": 0.691259503364563, + "num_tokens": 38213059.0, + "step": 1480 + }, + { + "epoch": 0.1626400175708324, + "ewc_loss": 3.680586814880371e-06, + 
"grad_norm": 2.416884183883667, + "learning_rate": 5.41727672035139e-07, + "loss": 1.1227, + "mean_token_accuracy": 0.6669793128967285, + "num_tokens": 38242367.0, + "step": 1481 + }, + { + "epoch": 0.16274983527344608, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.3135132789611816, + "learning_rate": 5.420937042459737e-07, + "loss": 1.1382, + "mean_token_accuracy": 0.6623738408088684, + "num_tokens": 38269270.0, + "step": 1482 + }, + { + "epoch": 0.16285965297605975, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.342783212661743, + "learning_rate": 5.424597364568082e-07, + "loss": 1.1015, + "mean_token_accuracy": 0.6869087219238281, + "num_tokens": 38297171.0, + "step": 1483 + }, + { + "epoch": 0.1629694706786734, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.6733412742614746, + "learning_rate": 5.428257686676427e-07, + "loss": 1.1008, + "mean_token_accuracy": 0.6744279861450195, + "num_tokens": 38319751.0, + "step": 1484 + }, + { + "epoch": 0.16307928838128707, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 2.4568424224853516, + "learning_rate": 5.431918008784773e-07, + "loss": 1.0327, + "mean_token_accuracy": 0.6925555467605591, + "num_tokens": 38344590.0, + "step": 1485 + }, + { + "epoch": 0.16318910608390072, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 2.394148588180542, + "learning_rate": 5.435578330893118e-07, + "loss": 1.0116, + "mean_token_accuracy": 0.7019158601760864, + "num_tokens": 38369984.0, + "step": 1486 + }, + { + "epoch": 0.1632989237865144, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 2.5806565284729004, + "learning_rate": 5.439238653001463e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.7000102996826172, + "num_tokens": 38392266.0, + "step": 1487 + }, + { + "epoch": 0.16340874148912804, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 2.0774002075195312, + "learning_rate": 5.44289897510981e-07, + "loss": 1.1787, + "mean_token_accuracy": 0.6595057249069214, + "num_tokens": 38426150.0, + "step": 
1488 + }, + { + "epoch": 0.1635185591917417, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 2.1354212760925293, + "learning_rate": 5.446559297218155e-07, + "loss": 1.0181, + "mean_token_accuracy": 0.7029708623886108, + "num_tokens": 38458153.0, + "step": 1489 + }, + { + "epoch": 0.16362837689435536, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 2.7297768592834473, + "learning_rate": 5.4502196193265e-07, + "loss": 1.0523, + "mean_token_accuracy": 0.6949508190155029, + "num_tokens": 38478408.0, + "step": 1490 + }, + { + "epoch": 0.16373819459696903, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 2.263817071914673, + "learning_rate": 5.453879941434846e-07, + "loss": 0.9706, + "mean_token_accuracy": 0.7048112750053406, + "num_tokens": 38507099.0, + "step": 1491 + }, + { + "epoch": 0.16384801229958268, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.3002779483795166, + "learning_rate": 5.457540263543191e-07, + "loss": 1.0413, + "mean_token_accuracy": 0.6914834976196289, + "num_tokens": 38532923.0, + "step": 1492 + }, + { + "epoch": 0.16395783000219635, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.387904405593872, + "learning_rate": 5.461200585651538e-07, + "loss": 1.1142, + "mean_token_accuracy": 0.6739556193351746, + "num_tokens": 38558137.0, + "step": 1493 + }, + { + "epoch": 0.16406764770481003, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.271482229232788, + "learning_rate": 5.464860907759883e-07, + "loss": 1.0539, + "mean_token_accuracy": 0.6888034343719482, + "num_tokens": 38584890.0, + "step": 1494 + }, + { + "epoch": 0.16417746540742367, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 2.3129491806030273, + "learning_rate": 5.468521229868228e-07, + "loss": 1.058, + "mean_token_accuracy": 0.6863123774528503, + "num_tokens": 38611310.0, + "step": 1495 + }, + { + "epoch": 0.16428728311003735, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 2.3630740642547607, + "learning_rate": 5.472181551976574e-07, + "loss": 
1.1065, + "mean_token_accuracy": 0.6764318943023682, + "num_tokens": 38638118.0, + "step": 1496 + }, + { + "epoch": 0.164397100812651, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.598862409591675, + "learning_rate": 5.475841874084919e-07, + "loss": 0.9698, + "mean_token_accuracy": 0.707473635673523, + "num_tokens": 38660015.0, + "step": 1497 + }, + { + "epoch": 0.16450691851526467, + "ewc_loss": 3.7401914596557617e-06, + "grad_norm": 2.315598249435425, + "learning_rate": 5.479502196193265e-07, + "loss": 1.0799, + "mean_token_accuracy": 0.6799057722091675, + "num_tokens": 38688580.0, + "step": 1498 + }, + { + "epoch": 0.1646167362178783, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.4709181785583496, + "learning_rate": 5.483162518301611e-07, + "loss": 1.0901, + "mean_token_accuracy": 0.6787328720092773, + "num_tokens": 38714193.0, + "step": 1499 + }, + { + "epoch": 0.16472655392049199, + "ewc_loss": 3.7401914596557617e-06, + "grad_norm": 2.2816498279571533, + "learning_rate": 5.486822840409956e-07, + "loss": 1.0794, + "mean_token_accuracy": 0.6802910566329956, + "num_tokens": 38742448.0, + "step": 1500 + }, + { + "epoch": 0.16483637162310563, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.4620747566223145, + "learning_rate": 5.490483162518301e-07, + "loss": 0.9871, + "mean_token_accuracy": 0.7079832553863525, + "num_tokens": 38765620.0, + "step": 1501 + }, + { + "epoch": 0.1649461893257193, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.4706180095672607, + "learning_rate": 5.494143484626647e-07, + "loss": 0.9619, + "mean_token_accuracy": 0.712846577167511, + "num_tokens": 38788077.0, + "step": 1502 + }, + { + "epoch": 0.16505600702833298, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.5119500160217285, + "learning_rate": 5.497803806734992e-07, + "loss": 1.0242, + "mean_token_accuracy": 0.6983769536018372, + "num_tokens": 38811822.0, + "step": 1503 + }, + { + "epoch": 0.16516582473094663, + "ewc_loss": 3.769993782043457e-06, + 
"grad_norm": 2.4605586528778076, + "learning_rate": 5.501464128843338e-07, + "loss": 1.0429, + "mean_token_accuracy": 0.6893837451934814, + "num_tokens": 38835722.0, + "step": 1504 + }, + { + "epoch": 0.1652756424335603, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.438587188720703, + "learning_rate": 5.505124450951684e-07, + "loss": 0.9976, + "mean_token_accuracy": 0.7004365921020508, + "num_tokens": 38859889.0, + "step": 1505 + }, + { + "epoch": 0.16538546013617395, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.2117598056793213, + "learning_rate": 5.508784773060029e-07, + "loss": 0.9982, + "mean_token_accuracy": 0.7142431735992432, + "num_tokens": 38890641.0, + "step": 1506 + }, + { + "epoch": 0.16549527783878762, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.4714126586914062, + "learning_rate": 5.512445095168374e-07, + "loss": 1.1048, + "mean_token_accuracy": 0.6789265275001526, + "num_tokens": 38914684.0, + "step": 1507 + }, + { + "epoch": 0.16560509554140126, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.751336097717285, + "learning_rate": 5.51610541727672e-07, + "loss": 1.0069, + "mean_token_accuracy": 0.7031275033950806, + "num_tokens": 38935499.0, + "step": 1508 + }, + { + "epoch": 0.16571491324401494, + "ewc_loss": 3.769993782043457e-06, + "grad_norm": 2.339306592941284, + "learning_rate": 5.519765739385066e-07, + "loss": 1.0458, + "mean_token_accuracy": 0.686759352684021, + "num_tokens": 38960393.0, + "step": 1509 + }, + { + "epoch": 0.16582473094662858, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 2.605607748031616, + "learning_rate": 5.523426061493412e-07, + "loss": 1.0758, + "mean_token_accuracy": 0.6899511814117432, + "num_tokens": 38983330.0, + "step": 1510 + }, + { + "epoch": 0.16593454864924226, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 2.2169196605682373, + "learning_rate": 5.527086383601757e-07, + "loss": 1.1364, + "mean_token_accuracy": 0.6675795316696167, + "num_tokens": 39011686.0, + "step": 
1511 + }, + { + "epoch": 0.16604436635185593, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 2.384061098098755, + "learning_rate": 5.530746705710102e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.688218891620636, + "num_tokens": 39037527.0, + "step": 1512 + }, + { + "epoch": 0.16615418405446958, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 2.6082561016082764, + "learning_rate": 5.534407027818448e-07, + "loss": 1.1084, + "mean_token_accuracy": 0.6712242960929871, + "num_tokens": 39061584.0, + "step": 1513 + }, + { + "epoch": 0.16626400175708325, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 2.5050320625305176, + "learning_rate": 5.538067349926794e-07, + "loss": 1.0233, + "mean_token_accuracy": 0.6949090957641602, + "num_tokens": 39085730.0, + "step": 1514 + }, + { + "epoch": 0.1663738194596969, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 2.9235544204711914, + "learning_rate": 5.541727672035139e-07, + "loss": 1.0348, + "mean_token_accuracy": 0.6981698870658875, + "num_tokens": 39103747.0, + "step": 1515 + }, + { + "epoch": 0.16648363716231057, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 2.6007566452026367, + "learning_rate": 5.545387994143485e-07, + "loss": 0.9896, + "mean_token_accuracy": 0.6996016502380371, + "num_tokens": 39124300.0, + "step": 1516 + }, + { + "epoch": 0.16659345486492422, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 2.457429885864258, + "learning_rate": 5.54904831625183e-07, + "loss": 0.957, + "mean_token_accuracy": 0.708320677280426, + "num_tokens": 39148700.0, + "step": 1517 + }, + { + "epoch": 0.1667032725675379, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 7.166348457336426, + "learning_rate": 5.552708638360175e-07, + "loss": 1.1181, + "mean_token_accuracy": 0.6677532196044922, + "num_tokens": 39172508.0, + "step": 1518 + }, + { + "epoch": 0.16681309027015154, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 2.3003251552581787, + "learning_rate": 5.556368960468521e-07, + "loss": 0.9639, + "mean_token_accuracy": 
0.7099064588546753, + "num_tokens": 39197782.0, + "step": 1519 + }, + { + "epoch": 0.1669229079727652, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.278984308242798, + "learning_rate": 5.560029282576867e-07, + "loss": 1.0153, + "mean_token_accuracy": 0.6982141733169556, + "num_tokens": 39224616.0, + "step": 1520 + }, + { + "epoch": 0.16703272567537888, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.3812663555145264, + "learning_rate": 5.563689604685212e-07, + "loss": 1.0039, + "mean_token_accuracy": 0.6995648145675659, + "num_tokens": 39249771.0, + "step": 1521 + }, + { + "epoch": 0.16714254337799253, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.514058828353882, + "learning_rate": 5.567349926793558e-07, + "loss": 0.9344, + "mean_token_accuracy": 0.714192271232605, + "num_tokens": 39271524.0, + "step": 1522 + }, + { + "epoch": 0.1672523610806062, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.291283369064331, + "learning_rate": 5.571010248901903e-07, + "loss": 1.0043, + "mean_token_accuracy": 0.6996513605117798, + "num_tokens": 39299086.0, + "step": 1523 + }, + { + "epoch": 0.16736217878321985, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.124164581298828, + "learning_rate": 5.574670571010248e-07, + "loss": 1.0164, + "mean_token_accuracy": 0.7040858864784241, + "num_tokens": 39331423.0, + "step": 1524 + }, + { + "epoch": 0.16747199648583352, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.6315524578094482, + "learning_rate": 5.578330893118595e-07, + "loss": 1.0704, + "mean_token_accuracy": 0.6829168200492859, + "num_tokens": 39352747.0, + "step": 1525 + }, + { + "epoch": 0.16758181418844717, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.453972816467285, + "learning_rate": 5.58199121522694e-07, + "loss": 1.0845, + "mean_token_accuracy": 0.6828954219818115, + "num_tokens": 39377552.0, + "step": 1526 + }, + { + "epoch": 0.16769163189106084, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.433384418487549, + 
"learning_rate": 5.585651537335286e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6854379177093506, + "num_tokens": 39401174.0, + "step": 1527 + }, + { + "epoch": 0.1678014495936745, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.237405776977539, + "learning_rate": 5.589311859443631e-07, + "loss": 0.9624, + "mean_token_accuracy": 0.7105687856674194, + "num_tokens": 39427179.0, + "step": 1528 + }, + { + "epoch": 0.16791126729628816, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.5637083053588867, + "learning_rate": 5.592972181551976e-07, + "loss": 0.9553, + "mean_token_accuracy": 0.7078632116317749, + "num_tokens": 39446853.0, + "step": 1529 + }, + { + "epoch": 0.1680210849989018, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.576660394668579, + "learning_rate": 5.596632503660323e-07, + "loss": 0.9674, + "mean_token_accuracy": 0.7121659517288208, + "num_tokens": 39467826.0, + "step": 1530 + }, + { + "epoch": 0.16813090270151548, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.3706154823303223, + "learning_rate": 5.600292825768668e-07, + "loss": 0.9248, + "mean_token_accuracy": 0.7170252799987793, + "num_tokens": 39492608.0, + "step": 1531 + }, + { + "epoch": 0.16824072040412916, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 2.668701648712158, + "learning_rate": 5.603953147877013e-07, + "loss": 1.0686, + "mean_token_accuracy": 0.6898029446601868, + "num_tokens": 39513984.0, + "step": 1532 + }, + { + "epoch": 0.1683505381067428, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.253544569015503, + "learning_rate": 5.607613469985359e-07, + "loss": 0.9868, + "mean_token_accuracy": 0.7033799886703491, + "num_tokens": 39541191.0, + "step": 1533 + }, + { + "epoch": 0.16846035580935648, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 2.3963980674743652, + "learning_rate": 5.611273792093704e-07, + "loss": 0.9945, + "mean_token_accuracy": 0.7158511877059937, + "num_tokens": 39565937.0, + "step": 1534 + }, + { + "epoch": 
0.16857017351197012, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.776663303375244, + "learning_rate": 5.614934114202049e-07, + "loss": 1.0368, + "mean_token_accuracy": 0.6961472034454346, + "num_tokens": 39585586.0, + "step": 1535 + }, + { + "epoch": 0.1686799912145838, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.138077974319458, + "learning_rate": 5.618594436310396e-07, + "loss": 1.1511, + "mean_token_accuracy": 0.6641238927841187, + "num_tokens": 39621116.0, + "step": 1536 + }, + { + "epoch": 0.16878980891719744, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.555785655975342, + "learning_rate": 5.622254758418741e-07, + "loss": 1.0834, + "mean_token_accuracy": 0.6830112934112549, + "num_tokens": 39645545.0, + "step": 1537 + }, + { + "epoch": 0.16889962661981112, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.383892774581909, + "learning_rate": 5.625915080527086e-07, + "loss": 1.0821, + "mean_token_accuracy": 0.6874599456787109, + "num_tokens": 39671881.0, + "step": 1538 + }, + { + "epoch": 0.16900944432242476, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.5168540477752686, + "learning_rate": 5.629575402635432e-07, + "loss": 1.0259, + "mean_token_accuracy": 0.6944419145584106, + "num_tokens": 39695290.0, + "step": 1539 + }, + { + "epoch": 0.16911926202503844, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.3163301944732666, + "learning_rate": 5.633235724743777e-07, + "loss": 1.0318, + "mean_token_accuracy": 0.7001670002937317, + "num_tokens": 39722135.0, + "step": 1540 + }, + { + "epoch": 0.1692290797276521, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.357296943664551, + "learning_rate": 5.636896046852124e-07, + "loss": 1.1607, + "mean_token_accuracy": 0.6604369282722473, + "num_tokens": 39752087.0, + "step": 1541 + }, + { + "epoch": 0.16933889743026576, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.192047357559204, + "learning_rate": 5.640556368960469e-07, + "loss": 1.036, + "mean_token_accuracy": 
0.6934807300567627, + "num_tokens": 39782108.0, + "step": 1542 + }, + { + "epoch": 0.16944871513287943, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.245610475540161, + "learning_rate": 5.644216691068814e-07, + "loss": 1.083, + "mean_token_accuracy": 0.6749152541160583, + "num_tokens": 39810707.0, + "step": 1543 + }, + { + "epoch": 0.16955853283549308, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.32936692237854, + "learning_rate": 5.64787701317716e-07, + "loss": 0.9987, + "mean_token_accuracy": 0.6997640132904053, + "num_tokens": 39836838.0, + "step": 1544 + }, + { + "epoch": 0.16966835053810675, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 2.457759141921997, + "learning_rate": 5.651537335285505e-07, + "loss": 0.9786, + "mean_token_accuracy": 0.7106056213378906, + "num_tokens": 39860798.0, + "step": 1545 + }, + { + "epoch": 0.1697781682407204, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 3.8471662998199463, + "learning_rate": 5.655197657393851e-07, + "loss": 1.1302, + "mean_token_accuracy": 0.6721370816230774, + "num_tokens": 39886558.0, + "step": 1546 + }, + { + "epoch": 0.16988798594333407, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.678292989730835, + "learning_rate": 5.658857979502197e-07, + "loss": 0.958, + "mean_token_accuracy": 0.7067670822143555, + "num_tokens": 39907445.0, + "step": 1547 + }, + { + "epoch": 0.16999780364594771, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.236088752746582, + "learning_rate": 5.662518301610542e-07, + "loss": 0.9565, + "mean_token_accuracy": 0.7176656723022461, + "num_tokens": 39937347.0, + "step": 1548 + }, + { + "epoch": 0.1701076213485614, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.2559561729431152, + "learning_rate": 5.666178623718887e-07, + "loss": 0.9951, + "mean_token_accuracy": 0.7052809596061707, + "num_tokens": 39966512.0, + "step": 1549 + }, + { + "epoch": 0.17021743905117506, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.4629499912261963, + 
"learning_rate": 5.669838945827233e-07, + "loss": 1.0792, + "mean_token_accuracy": 0.6798583269119263, + "num_tokens": 39993941.0, + "step": 1550 + }, + { + "epoch": 0.1703272567537887, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.9344258308410645, + "learning_rate": 5.673499267935577e-07, + "loss": 0.8982, + "mean_token_accuracy": 0.7315363883972168, + "num_tokens": 40012552.0, + "step": 1551 + }, + { + "epoch": 0.17043707445640238, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.163889169692993, + "learning_rate": 5.677159590043924e-07, + "loss": 1.0679, + "mean_token_accuracy": 0.6866980195045471, + "num_tokens": 40043398.0, + "step": 1552 + }, + { + "epoch": 0.17054689215901603, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.4459540843963623, + "learning_rate": 5.68081991215227e-07, + "loss": 1.1038, + "mean_token_accuracy": 0.6768288612365723, + "num_tokens": 40070386.0, + "step": 1553 + }, + { + "epoch": 0.1706567098616297, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.5121817588806152, + "learning_rate": 5.684480234260615e-07, + "loss": 1.0319, + "mean_token_accuracy": 0.6950927972793579, + "num_tokens": 40094339.0, + "step": 1554 + }, + { + "epoch": 0.17076652756424335, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 2.464698553085327, + "learning_rate": 5.688140556368959e-07, + "loss": 0.9462, + "mean_token_accuracy": 0.7211987376213074, + "num_tokens": 40118938.0, + "step": 1555 + }, + { + "epoch": 0.17087634526685702, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 2.3187665939331055, + "learning_rate": 5.691800878477305e-07, + "loss": 1.1216, + "mean_token_accuracy": 0.6676060557365417, + "num_tokens": 40148530.0, + "step": 1556 + }, + { + "epoch": 0.17098616296947067, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 2.382444143295288, + "learning_rate": 5.695461200585652e-07, + "loss": 1.0766, + "mean_token_accuracy": 0.687052845954895, + "num_tokens": 40175048.0, + "step": 1557 + }, + { + "epoch": 
0.17109598067208434, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 2.5149924755096436, + "learning_rate": 5.699121522693998e-07, + "loss": 1.0336, + "mean_token_accuracy": 0.6928260326385498, + "num_tokens": 40198815.0, + "step": 1558 + }, + { + "epoch": 0.17120579837469801, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 2.496276378631592, + "learning_rate": 5.702781844802342e-07, + "loss": 1.006, + "mean_token_accuracy": 0.7073419690132141, + "num_tokens": 40221652.0, + "step": 1559 + }, + { + "epoch": 0.17131561607731166, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 2.3097870349884033, + "learning_rate": 5.706442166910687e-07, + "loss": 1.0219, + "mean_token_accuracy": 0.6961365938186646, + "num_tokens": 40248573.0, + "step": 1560 + }, + { + "epoch": 0.17142543377992533, + "ewc_loss": 3.9637088775634766e-06, + "grad_norm": 2.5709445476531982, + "learning_rate": 5.710102489019033e-07, + "loss": 1.0965, + "mean_token_accuracy": 0.6809810996055603, + "num_tokens": 40273242.0, + "step": 1561 + }, + { + "epoch": 0.17153525148253898, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.507115602493286, + "learning_rate": 5.71376281112738e-07, + "loss": 1.0213, + "mean_token_accuracy": 0.6956870555877686, + "num_tokens": 40295379.0, + "step": 1562 + }, + { + "epoch": 0.17164506918515265, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.919949531555176, + "learning_rate": 5.717423133235724e-07, + "loss": 1.0255, + "mean_token_accuracy": 0.693596601486206, + "num_tokens": 40313918.0, + "step": 1563 + }, + { + "epoch": 0.1717548868877663, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.4306020736694336, + "learning_rate": 5.72108345534407e-07, + "loss": 1.0426, + "mean_token_accuracy": 0.6918017864227295, + "num_tokens": 40338811.0, + "step": 1564 + }, + { + "epoch": 0.17186470459037997, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.4868264198303223, + "learning_rate": 5.724743777452415e-07, + "loss": 1.049, + "mean_token_accuracy": 
0.6884647011756897, + "num_tokens": 40361823.0, + "step": 1565 + }, + { + "epoch": 0.17197452229299362, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.4664955139160156, + "learning_rate": 5.72840409956076e-07, + "loss": 1.0834, + "mean_token_accuracy": 0.6887696981430054, + "num_tokens": 40386526.0, + "step": 1566 + }, + { + "epoch": 0.1720843399956073, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.3524434566497803, + "learning_rate": 5.732064421669106e-07, + "loss": 1.0093, + "mean_token_accuracy": 0.7011082172393799, + "num_tokens": 40412670.0, + "step": 1567 + }, + { + "epoch": 0.17219415769822094, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.345594644546509, + "learning_rate": 5.735724743777452e-07, + "loss": 1.0291, + "mean_token_accuracy": 0.6955732703208923, + "num_tokens": 40439359.0, + "step": 1568 + }, + { + "epoch": 0.1723039754008346, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.3056223392486572, + "learning_rate": 5.739385065885797e-07, + "loss": 1.0235, + "mean_token_accuracy": 0.6985570192337036, + "num_tokens": 40466494.0, + "step": 1569 + }, + { + "epoch": 0.1724137931034483, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.609140396118164, + "learning_rate": 5.743045387994143e-07, + "loss": 0.9814, + "mean_token_accuracy": 0.7035735845565796, + "num_tokens": 40488412.0, + "step": 1570 + }, + { + "epoch": 0.17252361080606193, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 2.4567677974700928, + "learning_rate": 5.746705710102488e-07, + "loss": 1.0768, + "mean_token_accuracy": 0.6816680431365967, + "num_tokens": 40511460.0, + "step": 1571 + }, + { + "epoch": 0.1726334285086756, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.2702620029449463, + "learning_rate": 5.750366032210833e-07, + "loss": 0.9947, + "mean_token_accuracy": 0.7058907747268677, + "num_tokens": 40537397.0, + "step": 1572 + }, + { + "epoch": 0.17274324621128925, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.716569423675537, + 
"learning_rate": 5.75402635431918e-07, + "loss": 1.0023, + "mean_token_accuracy": 0.6997243762016296, + "num_tokens": 40556838.0, + "step": 1573 + }, + { + "epoch": 0.17285306391390293, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.8582072257995605, + "learning_rate": 5.757686676427525e-07, + "loss": 1.013, + "mean_token_accuracy": 0.6945913434028625, + "num_tokens": 40575543.0, + "step": 1574 + }, + { + "epoch": 0.17296288161651657, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.2420318126678467, + "learning_rate": 5.761346998535871e-07, + "loss": 1.067, + "mean_token_accuracy": 0.6869481801986694, + "num_tokens": 40604152.0, + "step": 1575 + }, + { + "epoch": 0.17307269931913025, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.4693071842193604, + "learning_rate": 5.765007320644216e-07, + "loss": 1.0607, + "mean_token_accuracy": 0.6864616274833679, + "num_tokens": 40627574.0, + "step": 1576 + }, + { + "epoch": 0.1731825170217439, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.3421175479888916, + "learning_rate": 5.768667642752561e-07, + "loss": 1.0015, + "mean_token_accuracy": 0.6994099020957947, + "num_tokens": 40651057.0, + "step": 1577 + }, + { + "epoch": 0.17329233472435757, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.297266721725464, + "learning_rate": 5.772327964860908e-07, + "loss": 1.0365, + "mean_token_accuracy": 0.6887686252593994, + "num_tokens": 40678662.0, + "step": 1578 + }, + { + "epoch": 0.17340215242697124, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 2.3917908668518066, + "learning_rate": 5.775988286969253e-07, + "loss": 1.0999, + "mean_token_accuracy": 0.6834765672683716, + "num_tokens": 40704905.0, + "step": 1579 + }, + { + "epoch": 0.17351197012958489, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.665339708328247, + "learning_rate": 5.779648609077598e-07, + "loss": 0.9641, + "mean_token_accuracy": 0.7115200757980347, + "num_tokens": 40725916.0, + "step": 1580 + }, + { + "epoch": 
0.17362178783219856, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.5969223976135254, + "learning_rate": 5.783308931185944e-07, + "loss": 1.0628, + "mean_token_accuracy": 0.6847897171974182, + "num_tokens": 40746748.0, + "step": 1581 + }, + { + "epoch": 0.1737316055348122, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.2410640716552734, + "learning_rate": 5.786969253294289e-07, + "loss": 1.1335, + "mean_token_accuracy": 0.6853362321853638, + "num_tokens": 40779091.0, + "step": 1582 + }, + { + "epoch": 0.17384142323742588, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.1469955444335938, + "learning_rate": 5.790629575402634e-07, + "loss": 1.1124, + "mean_token_accuracy": 0.6711179614067078, + "num_tokens": 40809761.0, + "step": 1583 + }, + { + "epoch": 0.17395124094003953, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.1590425968170166, + "learning_rate": 5.794289897510981e-07, + "loss": 1.0779, + "mean_token_accuracy": 0.6853319406509399, + "num_tokens": 40841486.0, + "step": 1584 + }, + { + "epoch": 0.1740610586426532, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.1796364784240723, + "learning_rate": 5.797950219619326e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.6826847195625305, + "num_tokens": 40870417.0, + "step": 1585 + }, + { + "epoch": 0.17417087634526685, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 2.250671863555908, + "learning_rate": 5.801610541727671e-07, + "loss": 1.0227, + "mean_token_accuracy": 0.7023583054542542, + "num_tokens": 40898422.0, + "step": 1586 + }, + { + "epoch": 0.17428069404788052, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.5907492637634277, + "learning_rate": 5.805270863836017e-07, + "loss": 1.137, + "mean_token_accuracy": 0.669028639793396, + "num_tokens": 40921246.0, + "step": 1587 + }, + { + "epoch": 0.1743905117504942, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.314516544342041, + "learning_rate": 5.808931185944362e-07, + "loss": 1.0248, + 
"mean_token_accuracy": 0.6897822618484497, + "num_tokens": 40947697.0, + "step": 1588 + }, + { + "epoch": 0.17450032945310784, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.3826076984405518, + "learning_rate": 5.812591508052709e-07, + "loss": 0.9503, + "mean_token_accuracy": 0.7156261801719666, + "num_tokens": 40971954.0, + "step": 1589 + }, + { + "epoch": 0.1746101471557215, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.2220702171325684, + "learning_rate": 5.816251830161054e-07, + "loss": 1.0675, + "mean_token_accuracy": 0.6807459592819214, + "num_tokens": 41003026.0, + "step": 1590 + }, + { + "epoch": 0.17471996485833516, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.435915470123291, + "learning_rate": 5.819912152269399e-07, + "loss": 1.0752, + "mean_token_accuracy": 0.6798471212387085, + "num_tokens": 41027889.0, + "step": 1591 + }, + { + "epoch": 0.17482978256094883, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.7250492572784424, + "learning_rate": 5.823572474377745e-07, + "loss": 1.0432, + "mean_token_accuracy": 0.6890032291412354, + "num_tokens": 41048983.0, + "step": 1592 + }, + { + "epoch": 0.17493960026356248, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.418473720550537, + "learning_rate": 5.82723279648609e-07, + "loss": 1.0955, + "mean_token_accuracy": 0.6723891496658325, + "num_tokens": 41074025.0, + "step": 1593 + }, + { + "epoch": 0.17504941796617615, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.2661948204040527, + "learning_rate": 5.830893118594436e-07, + "loss": 1.0948, + "mean_token_accuracy": 0.6761396527290344, + "num_tokens": 41104142.0, + "step": 1594 + }, + { + "epoch": 0.1751592356687898, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.4525561332702637, + "learning_rate": 5.834553440702782e-07, + "loss": 1.0607, + "mean_token_accuracy": 0.6875842809677124, + "num_tokens": 41128610.0, + "step": 1595 + }, + { + "epoch": 0.17526905337140347, + "ewc_loss": 4.082918167114258e-06, + 
"grad_norm": 2.3084371089935303, + "learning_rate": 5.838213762811127e-07, + "loss": 1.0013, + "mean_token_accuracy": 0.7021129131317139, + "num_tokens": 41154215.0, + "step": 1596 + }, + { + "epoch": 0.17537887107401715, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.240354537963867, + "learning_rate": 5.841874084919472e-07, + "loss": 1.0914, + "mean_token_accuracy": 0.673507571220398, + "num_tokens": 41186683.0, + "step": 1597 + }, + { + "epoch": 0.1754886887766308, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.4072580337524414, + "learning_rate": 5.845534407027818e-07, + "loss": 1.0101, + "mean_token_accuracy": 0.6937150955200195, + "num_tokens": 41211138.0, + "step": 1598 + }, + { + "epoch": 0.17559850647924446, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.8812949657440186, + "learning_rate": 5.849194729136163e-07, + "loss": 0.938, + "mean_token_accuracy": 0.717313289642334, + "num_tokens": 41229133.0, + "step": 1599 + }, + { + "epoch": 0.1757083241818581, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.510469436645508, + "learning_rate": 5.852855051244509e-07, + "loss": 1.0713, + "mean_token_accuracy": 0.6876757144927979, + "num_tokens": 41254358.0, + "step": 1600 + }, + { + "epoch": 0.17581814188447178, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.279064655303955, + "learning_rate": 5.856515373352855e-07, + "loss": 1.0978, + "mean_token_accuracy": 0.6841310858726501, + "num_tokens": 41282152.0, + "step": 1601 + }, + { + "epoch": 0.17592795958708543, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.411909580230713, + "learning_rate": 5.8601756954612e-07, + "loss": 0.974, + "mean_token_accuracy": 0.7140673398971558, + "num_tokens": 41307760.0, + "step": 1602 + }, + { + "epoch": 0.1760377772896991, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.3480770587921143, + "learning_rate": 5.863836017569545e-07, + "loss": 0.9927, + "mean_token_accuracy": 0.7025796175003052, + "num_tokens": 41333522.0, + "step": 1603 + }, 
+ { + "epoch": 0.17614759499231275, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 2.3993759155273438, + "learning_rate": 5.867496339677891e-07, + "loss": 1.028, + "mean_token_accuracy": 0.6974024176597595, + "num_tokens": 41357614.0, + "step": 1604 + }, + { + "epoch": 0.17625741269492642, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.3425111770629883, + "learning_rate": 5.871156661786237e-07, + "loss": 0.9813, + "mean_token_accuracy": 0.7094821929931641, + "num_tokens": 41383591.0, + "step": 1605 + }, + { + "epoch": 0.17636723039754007, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.362058639526367, + "learning_rate": 5.874816983894583e-07, + "loss": 1.108, + "mean_token_accuracy": 0.6735132932662964, + "num_tokens": 41410821.0, + "step": 1606 + }, + { + "epoch": 0.17647704810015374, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.5719656944274902, + "learning_rate": 5.878477306002928e-07, + "loss": 1.0796, + "mean_token_accuracy": 0.6847983002662659, + "num_tokens": 41433212.0, + "step": 1607 + }, + { + "epoch": 0.17658686580276742, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.3294284343719482, + "learning_rate": 5.882137628111273e-07, + "loss": 1.1546, + "mean_token_accuracy": 0.6594511270523071, + "num_tokens": 41461618.0, + "step": 1608 + }, + { + "epoch": 0.17669668350538106, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.2289886474609375, + "learning_rate": 5.885797950219619e-07, + "loss": 1.0671, + "mean_token_accuracy": 0.6816161870956421, + "num_tokens": 41490369.0, + "step": 1609 + }, + { + "epoch": 0.17680650120799474, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.3771119117736816, + "learning_rate": 5.889458272327965e-07, + "loss": 1.2212, + "mean_token_accuracy": 0.6470344662666321, + "num_tokens": 41518493.0, + "step": 1610 + }, + { + "epoch": 0.17691631891060838, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.300858736038208, + "learning_rate": 5.89311859443631e-07, + "loss": 1.1302, + 
"mean_token_accuracy": 0.6732364892959595, + "num_tokens": 41545826.0, + "step": 1611 + }, + { + "epoch": 0.17702613661322206, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.5051231384277344, + "learning_rate": 5.896778916544656e-07, + "loss": 1.0244, + "mean_token_accuracy": 0.6907700300216675, + "num_tokens": 41568877.0, + "step": 1612 + }, + { + "epoch": 0.1771359543158357, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.419966697692871, + "learning_rate": 5.900439238653001e-07, + "loss": 0.9932, + "mean_token_accuracy": 0.7040177583694458, + "num_tokens": 41594228.0, + "step": 1613 + }, + { + "epoch": 0.17724577201844938, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.459862470626831, + "learning_rate": 5.904099560761346e-07, + "loss": 1.1011, + "mean_token_accuracy": 0.6794344186782837, + "num_tokens": 41620861.0, + "step": 1614 + }, + { + "epoch": 0.17735558972106302, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.3976707458496094, + "learning_rate": 5.907759882869692e-07, + "loss": 1.0997, + "mean_token_accuracy": 0.680262565612793, + "num_tokens": 41646859.0, + "step": 1615 + }, + { + "epoch": 0.1774654074236767, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.4954254627227783, + "learning_rate": 5.911420204978038e-07, + "loss": 1.0609, + "mean_token_accuracy": 0.6870160102844238, + "num_tokens": 41670100.0, + "step": 1616 + }, + { + "epoch": 0.17757522512629037, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.6419975757598877, + "learning_rate": 5.915080527086383e-07, + "loss": 1.0014, + "mean_token_accuracy": 0.705856204032898, + "num_tokens": 41690417.0, + "step": 1617 + }, + { + "epoch": 0.17768504282890402, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.044508218765259, + "learning_rate": 5.918740849194729e-07, + "loss": 0.966, + "mean_token_accuracy": 0.719251275062561, + "num_tokens": 41723702.0, + "step": 1618 + }, + { + "epoch": 0.1777948605315177, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 
2.372087240219116, + "learning_rate": 5.922401171303074e-07, + "loss": 0.9825, + "mean_token_accuracy": 0.7033032178878784, + "num_tokens": 41747292.0, + "step": 1619 + }, + { + "epoch": 0.17790467823413134, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.614370822906494, + "learning_rate": 5.926061493411419e-07, + "loss": 1.0857, + "mean_token_accuracy": 0.6798523664474487, + "num_tokens": 41771664.0, + "step": 1620 + }, + { + "epoch": 0.178014495936745, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.2556419372558594, + "learning_rate": 5.929721815519766e-07, + "loss": 1.0955, + "mean_token_accuracy": 0.6764050126075745, + "num_tokens": 41802272.0, + "step": 1621 + }, + { + "epoch": 0.17812431363935866, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.364565849304199, + "learning_rate": 5.933382137628111e-07, + "loss": 1.0373, + "mean_token_accuracy": 0.6890161037445068, + "num_tokens": 41826323.0, + "step": 1622 + }, + { + "epoch": 0.17823413134197233, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.1800942420959473, + "learning_rate": 5.937042459736457e-07, + "loss": 1.1462, + "mean_token_accuracy": 0.6657683849334717, + "num_tokens": 41859043.0, + "step": 1623 + }, + { + "epoch": 0.17834394904458598, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.611636161804199, + "learning_rate": 5.940702781844802e-07, + "loss": 1.0107, + "mean_token_accuracy": 0.6983007192611694, + "num_tokens": 41880055.0, + "step": 1624 + }, + { + "epoch": 0.17845376674719965, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.4051449298858643, + "learning_rate": 5.944363103953147e-07, + "loss": 1.0764, + "mean_token_accuracy": 0.6785317063331604, + "num_tokens": 41906987.0, + "step": 1625 + }, + { + "epoch": 0.17856358444981332, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.532042980194092, + "learning_rate": 5.948023426061494e-07, + "loss": 1.0514, + "mean_token_accuracy": 0.699270486831665, + "num_tokens": 41929196.0, + "step": 1626 + }, + { + 
"epoch": 0.17867340215242697, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.4053871631622314, + "learning_rate": 5.951683748169839e-07, + "loss": 1.1585, + "mean_token_accuracy": 0.6659165620803833, + "num_tokens": 41955354.0, + "step": 1627 + }, + { + "epoch": 0.17878321985504064, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.344756603240967, + "learning_rate": 5.955344070278184e-07, + "loss": 1.1574, + "mean_token_accuracy": 0.6578185558319092, + "num_tokens": 41982518.0, + "step": 1628 + }, + { + "epoch": 0.1788930375576543, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 2.3813328742980957, + "learning_rate": 5.95900439238653e-07, + "loss": 1.0084, + "mean_token_accuracy": 0.6974092721939087, + "num_tokens": 42006184.0, + "step": 1629 + }, + { + "epoch": 0.17900285526026796, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 3.0152504444122314, + "learning_rate": 5.962664714494875e-07, + "loss": 1.0029, + "mean_token_accuracy": 0.70109623670578, + "num_tokens": 42025392.0, + "step": 1630 + }, + { + "epoch": 0.1791126729628816, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.2424416542053223, + "learning_rate": 5.96632503660322e-07, + "loss": 1.1858, + "mean_token_accuracy": 0.6602989435195923, + "num_tokens": 42055514.0, + "step": 1631 + }, + { + "epoch": 0.17922249066549528, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.4824531078338623, + "learning_rate": 5.969985358711567e-07, + "loss": 1.0811, + "mean_token_accuracy": 0.6824007630348206, + "num_tokens": 42081033.0, + "step": 1632 + }, + { + "epoch": 0.17933230836810893, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.5164666175842285, + "learning_rate": 5.973645680819912e-07, + "loss": 0.9868, + "mean_token_accuracy": 0.7104356288909912, + "num_tokens": 42104399.0, + "step": 1633 + }, + { + "epoch": 0.1794421260707226, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.4132778644561768, + "learning_rate": 5.977306002928257e-07, + "loss": 1.0653, + 
"mean_token_accuracy": 0.6824904084205627, + "num_tokens": 42128912.0, + "step": 1634 + }, + { + "epoch": 0.17955194377333628, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.7182304859161377, + "learning_rate": 5.980966325036603e-07, + "loss": 1.1013, + "mean_token_accuracy": 0.6760286092758179, + "num_tokens": 42150892.0, + "step": 1635 + }, + { + "epoch": 0.17966176147594992, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.5625250339508057, + "learning_rate": 5.984626647144948e-07, + "loss": 0.9865, + "mean_token_accuracy": 0.7117516994476318, + "num_tokens": 42172510.0, + "step": 1636 + }, + { + "epoch": 0.1797715791785636, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.427651882171631, + "learning_rate": 5.988286969253294e-07, + "loss": 1.0627, + "mean_token_accuracy": 0.6834865808486938, + "num_tokens": 42199310.0, + "step": 1637 + }, + { + "epoch": 0.17988139688117724, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 2.252523183822632, + "learning_rate": 5.99194729136164e-07, + "loss": 1.0979, + "mean_token_accuracy": 0.6739680171012878, + "num_tokens": 42229990.0, + "step": 1638 + }, + { + "epoch": 0.17999121458379091, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.694462299346924, + "learning_rate": 5.995607613469985e-07, + "loss": 0.9687, + "mean_token_accuracy": 0.711051344871521, + "num_tokens": 42249858.0, + "step": 1639 + }, + { + "epoch": 0.18010103228640456, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.6204278469085693, + "learning_rate": 5.999267935578331e-07, + "loss": 0.9637, + "mean_token_accuracy": 0.7087351083755493, + "num_tokens": 42271355.0, + "step": 1640 + }, + { + "epoch": 0.18021084998901823, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.4136226177215576, + "learning_rate": 6.002928257686676e-07, + "loss": 1.0341, + "mean_token_accuracy": 0.698241651058197, + "num_tokens": 42297624.0, + "step": 1641 + }, + { + "epoch": 0.18032066769163188, + "ewc_loss": 4.202127456665039e-06, + 
"grad_norm": 2.439775228500366, + "learning_rate": 6.006588579795022e-07, + "loss": 0.9741, + "mean_token_accuracy": 0.7058762311935425, + "num_tokens": 42322395.0, + "step": 1642 + }, + { + "epoch": 0.18043048539424555, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.6794049739837646, + "learning_rate": 6.010248901903368e-07, + "loss": 1.0271, + "mean_token_accuracy": 0.7076210379600525, + "num_tokens": 42344634.0, + "step": 1643 + }, + { + "epoch": 0.1805403030968592, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.4417011737823486, + "learning_rate": 6.013909224011713e-07, + "loss": 1.1085, + "mean_token_accuracy": 0.6770945191383362, + "num_tokens": 42370501.0, + "step": 1644 + }, + { + "epoch": 0.18065012079947287, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.286123514175415, + "learning_rate": 6.017569546120058e-07, + "loss": 1.0577, + "mean_token_accuracy": 0.6908562183380127, + "num_tokens": 42396966.0, + "step": 1645 + }, + { + "epoch": 0.18075993850208655, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.3532302379608154, + "learning_rate": 6.021229868228404e-07, + "loss": 1.1047, + "mean_token_accuracy": 0.6842414140701294, + "num_tokens": 42423033.0, + "step": 1646 + }, + { + "epoch": 0.1808697562047002, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.0237181186676025, + "learning_rate": 6.024890190336749e-07, + "loss": 1.0528, + "mean_token_accuracy": 0.6935054063796997, + "num_tokens": 42456449.0, + "step": 1647 + }, + { + "epoch": 0.18097957390731387, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.4937520027160645, + "learning_rate": 6.028550512445095e-07, + "loss": 1.0184, + "mean_token_accuracy": 0.6999000310897827, + "num_tokens": 42480035.0, + "step": 1648 + }, + { + "epoch": 0.1810893916099275, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.4831392765045166, + "learning_rate": 6.032210834553441e-07, + "loss": 1.0279, + "mean_token_accuracy": 0.6997485756874084, + "num_tokens": 42503308.0, + "step": 
1649 + }, + { + "epoch": 0.1811992093125412, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.267366647720337, + "learning_rate": 6.035871156661786e-07, + "loss": 1.0936, + "mean_token_accuracy": 0.6776725053787231, + "num_tokens": 42532075.0, + "step": 1650 + }, + { + "epoch": 0.18130902701515483, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.3943958282470703, + "learning_rate": 6.039531478770131e-07, + "loss": 1.005, + "mean_token_accuracy": 0.702995777130127, + "num_tokens": 42556184.0, + "step": 1651 + }, + { + "epoch": 0.1814188447177685, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.5750410556793213, + "learning_rate": 6.043191800878477e-07, + "loss": 1.1048, + "mean_token_accuracy": 0.6802261471748352, + "num_tokens": 42578435.0, + "step": 1652 + }, + { + "epoch": 0.18152866242038215, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.414501190185547, + "learning_rate": 6.046852122986823e-07, + "loss": 1.0599, + "mean_token_accuracy": 0.685716450214386, + "num_tokens": 42604295.0, + "step": 1653 + }, + { + "epoch": 0.18163848012299583, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.395961046218872, + "learning_rate": 6.050512445095168e-07, + "loss": 1.1314, + "mean_token_accuracy": 0.6798405051231384, + "num_tokens": 42632476.0, + "step": 1654 + }, + { + "epoch": 0.1817482978256095, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.5196077823638916, + "learning_rate": 6.054172767203514e-07, + "loss": 1.1097, + "mean_token_accuracy": 0.674696683883667, + "num_tokens": 42657921.0, + "step": 1655 + }, + { + "epoch": 0.18185811552822315, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.5703375339508057, + "learning_rate": 6.057833089311859e-07, + "loss": 1.1519, + "mean_token_accuracy": 0.6638103127479553, + "num_tokens": 42683798.0, + "step": 1656 + }, + { + "epoch": 0.18196793323083682, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.4104835987091064, + "learning_rate": 6.061493411420205e-07, + "loss": 1.0053, + 
"mean_token_accuracy": 0.7014800310134888, + "num_tokens": 42707333.0, + "step": 1657 + }, + { + "epoch": 0.18207775093345047, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 2.3031210899353027, + "learning_rate": 6.065153733528551e-07, + "loss": 1.0569, + "mean_token_accuracy": 0.6933670043945312, + "num_tokens": 42733776.0, + "step": 1658 + }, + { + "epoch": 0.18218756863606414, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.0184481143951416, + "learning_rate": 6.068814055636896e-07, + "loss": 1.0981, + "mean_token_accuracy": 0.6769571900367737, + "num_tokens": 42769503.0, + "step": 1659 + }, + { + "epoch": 0.18229738633867779, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.608290672302246, + "learning_rate": 6.072474377745242e-07, + "loss": 1.0158, + "mean_token_accuracy": 0.7027553915977478, + "num_tokens": 42791718.0, + "step": 1660 + }, + { + "epoch": 0.18240720404129146, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.4823131561279297, + "learning_rate": 6.076134699853587e-07, + "loss": 1.0417, + "mean_token_accuracy": 0.6915068626403809, + "num_tokens": 42816722.0, + "step": 1661 + }, + { + "epoch": 0.1825170217439051, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.243273973464966, + "learning_rate": 6.079795021961932e-07, + "loss": 0.9933, + "mean_token_accuracy": 0.6994502544403076, + "num_tokens": 42844961.0, + "step": 1662 + }, + { + "epoch": 0.18262683944651878, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.3190841674804688, + "learning_rate": 6.083455344070278e-07, + "loss": 0.9504, + "mean_token_accuracy": 0.7088980674743652, + "num_tokens": 42870511.0, + "step": 1663 + }, + { + "epoch": 0.18273665714913245, + "ewc_loss": 4.231929779052734e-06, + "grad_norm": 2.255720376968384, + "learning_rate": 6.087115666178624e-07, + "loss": 0.9774, + "mean_token_accuracy": 0.7074929475784302, + "num_tokens": 42897649.0, + "step": 1664 + }, + { + "epoch": 0.1828464748517461, + "ewc_loss": 4.231929779052734e-06, + 
"grad_norm": 2.136495590209961, + "learning_rate": 6.090775988286969e-07, + "loss": 0.9868, + "mean_token_accuracy": 0.7040526270866394, + "num_tokens": 42926306.0, + "step": 1665 + }, + { + "epoch": 0.18295629255435977, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.3704047203063965, + "learning_rate": 6.094436310395315e-07, + "loss": 1.0271, + "mean_token_accuracy": 0.6936267614364624, + "num_tokens": 42951568.0, + "step": 1666 + }, + { + "epoch": 0.18306611025697342, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.498591899871826, + "learning_rate": 6.09809663250366e-07, + "loss": 1.1073, + "mean_token_accuracy": 0.6862553954124451, + "num_tokens": 42975783.0, + "step": 1667 + }, + { + "epoch": 0.1831759279595871, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.3087494373321533, + "learning_rate": 6.101756954612005e-07, + "loss": 1.1301, + "mean_token_accuracy": 0.6704069375991821, + "num_tokens": 43004209.0, + "step": 1668 + }, + { + "epoch": 0.18328574566220074, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.5461719036102295, + "learning_rate": 6.105417276720352e-07, + "loss": 1.0442, + "mean_token_accuracy": 0.6867893934249878, + "num_tokens": 43028852.0, + "step": 1669 + }, + { + "epoch": 0.1833955633648144, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.452544927597046, + "learning_rate": 6.109077598828697e-07, + "loss": 1.0151, + "mean_token_accuracy": 0.6931190490722656, + "num_tokens": 43051679.0, + "step": 1670 + }, + { + "epoch": 0.18350538106742806, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 2.3146705627441406, + "learning_rate": 6.112737920937042e-07, + "loss": 1.1044, + "mean_token_accuracy": 0.6762261986732483, + "num_tokens": 43079437.0, + "step": 1671 + }, + { + "epoch": 0.18361519877004173, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.077528715133667, + "learning_rate": 6.116398243045388e-07, + "loss": 1.0754, + "mean_token_accuracy": 0.6808024644851685, + "num_tokens": 43111882.0, + "step": 1672 
+ }, + { + "epoch": 0.1837250164726554, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.4129586219787598, + "learning_rate": 6.120058565153733e-07, + "loss": 1.0805, + "mean_token_accuracy": 0.6912629008293152, + "num_tokens": 43138332.0, + "step": 1673 + }, + { + "epoch": 0.18383483417526905, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.4139692783355713, + "learning_rate": 6.12371888726208e-07, + "loss": 1.0768, + "mean_token_accuracy": 0.6861794590950012, + "num_tokens": 43163166.0, + "step": 1674 + }, + { + "epoch": 0.18394465187788273, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.1461451053619385, + "learning_rate": 6.127379209370425e-07, + "loss": 1.0298, + "mean_token_accuracy": 0.6973650455474854, + "num_tokens": 43192052.0, + "step": 1675 + }, + { + "epoch": 0.18405446958049637, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.291774034500122, + "learning_rate": 6.13103953147877e-07, + "loss": 0.9879, + "mean_token_accuracy": 0.7008029818534851, + "num_tokens": 43218645.0, + "step": 1676 + }, + { + "epoch": 0.18416428728311005, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.5478620529174805, + "learning_rate": 6.134699853587116e-07, + "loss": 1.0863, + "mean_token_accuracy": 0.6739085912704468, + "num_tokens": 43241509.0, + "step": 1677 + }, + { + "epoch": 0.1842741049857237, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.572218656539917, + "learning_rate": 6.138360175695461e-07, + "loss": 1.026, + "mean_token_accuracy": 0.6945911049842834, + "num_tokens": 43264698.0, + "step": 1678 + }, + { + "epoch": 0.18438392268833736, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.5680840015411377, + "learning_rate": 6.142020497803806e-07, + "loss": 0.9988, + "mean_token_accuracy": 0.7065088748931885, + "num_tokens": 43287337.0, + "step": 1679 + }, + { + "epoch": 0.184493740390951, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.512558698654175, + "learning_rate": 6.145680819912153e-07, + "loss": 1.0845, + 
"mean_token_accuracy": 0.6837453842163086, + "num_tokens": 43313430.0, + "step": 1680 + }, + { + "epoch": 0.18460355809356468, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.4725141525268555, + "learning_rate": 6.149341142020498e-07, + "loss": 1.0587, + "mean_token_accuracy": 0.6875399351119995, + "num_tokens": 43337787.0, + "step": 1681 + }, + { + "epoch": 0.18471337579617833, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.0834338665008545, + "learning_rate": 6.153001464128843e-07, + "loss": 1.11, + "mean_token_accuracy": 0.6746338605880737, + "num_tokens": 43373378.0, + "step": 1682 + }, + { + "epoch": 0.184823193498792, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.1488146781921387, + "learning_rate": 6.156661786237189e-07, + "loss": 0.9681, + "mean_token_accuracy": 0.7099196910858154, + "num_tokens": 43404353.0, + "step": 1683 + }, + { + "epoch": 0.18493301120140568, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.2421751022338867, + "learning_rate": 6.160322108345534e-07, + "loss": 1.1005, + "mean_token_accuracy": 0.6792815923690796, + "num_tokens": 43433934.0, + "step": 1684 + }, + { + "epoch": 0.18504282890401932, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.1155123710632324, + "learning_rate": 6.16398243045388e-07, + "loss": 0.9965, + "mean_token_accuracy": 0.6983285546302795, + "num_tokens": 43466980.0, + "step": 1685 + }, + { + "epoch": 0.185152646606633, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.255725383758545, + "learning_rate": 6.167642752562226e-07, + "loss": 1.0273, + "mean_token_accuracy": 0.7027647495269775, + "num_tokens": 43494439.0, + "step": 1686 + }, + { + "epoch": 0.18526246430924664, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.2692155838012695, + "learning_rate": 6.171303074670571e-07, + "loss": 1.1434, + "mean_token_accuracy": 0.6708329319953918, + "num_tokens": 43521827.0, + "step": 1687 + }, + { + "epoch": 0.18537228201186032, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 
2.4797966480255127, + "learning_rate": 6.174963396778916e-07, + "loss": 1.09, + "mean_token_accuracy": 0.6739647388458252, + "num_tokens": 43546844.0, + "step": 1688 + }, + { + "epoch": 0.18548209971447396, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.334352970123291, + "learning_rate": 6.178623718887262e-07, + "loss": 1.1191, + "mean_token_accuracy": 0.6673004627227783, + "num_tokens": 43579098.0, + "step": 1689 + }, + { + "epoch": 0.18559191741708764, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.7401885986328125, + "learning_rate": 6.182284040995608e-07, + "loss": 0.9162, + "mean_token_accuracy": 0.7250176668167114, + "num_tokens": 43599155.0, + "step": 1690 + }, + { + "epoch": 0.18570173511970128, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.5192222595214844, + "learning_rate": 6.185944363103954e-07, + "loss": 1.0214, + "mean_token_accuracy": 0.6966814994812012, + "num_tokens": 43622669.0, + "step": 1691 + }, + { + "epoch": 0.18581155282231496, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 2.4899590015411377, + "learning_rate": 6.189604685212299e-07, + "loss": 1.0107, + "mean_token_accuracy": 0.6978676319122314, + "num_tokens": 43646422.0, + "step": 1692 + }, + { + "epoch": 0.18592137052492863, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.1683359146118164, + "learning_rate": 6.193265007320644e-07, + "loss": 1.012, + "mean_token_accuracy": 0.6996433138847351, + "num_tokens": 43676400.0, + "step": 1693 + }, + { + "epoch": 0.18603118822754228, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.735793352127075, + "learning_rate": 6.19692532942899e-07, + "loss": 1.0193, + "mean_token_accuracy": 0.6979636549949646, + "num_tokens": 43697193.0, + "step": 1694 + }, + { + "epoch": 0.18614100593015595, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.5565989017486572, + "learning_rate": 6.200585651537335e-07, + "loss": 1.0582, + "mean_token_accuracy": 0.6885993480682373, + "num_tokens": 43721079.0, + "step": 1695 + }, + { + 
"epoch": 0.1862508236327696, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.4322540760040283, + "learning_rate": 6.204245973645681e-07, + "loss": 1.0512, + "mean_token_accuracy": 0.6936454176902771, + "num_tokens": 43747631.0, + "step": 1696 + }, + { + "epoch": 0.18636064133538327, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.276397466659546, + "learning_rate": 6.207906295754027e-07, + "loss": 1.0217, + "mean_token_accuracy": 0.6946293115615845, + "num_tokens": 43778341.0, + "step": 1697 + }, + { + "epoch": 0.18647045903799692, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.540499448776245, + "learning_rate": 6.211566617862372e-07, + "loss": 1.019, + "mean_token_accuracy": 0.6972309947013855, + "num_tokens": 43802076.0, + "step": 1698 + }, + { + "epoch": 0.1865802767406106, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.327838659286499, + "learning_rate": 6.215226939970716e-07, + "loss": 1.094, + "mean_token_accuracy": 0.6777416467666626, + "num_tokens": 43829357.0, + "step": 1699 + }, + { + "epoch": 0.18669009444322424, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.529099941253662, + "learning_rate": 6.218887262079063e-07, + "loss": 0.9687, + "mean_token_accuracy": 0.7040126323699951, + "num_tokens": 43852687.0, + "step": 1700 + }, + { + "epoch": 0.1867999121458379, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.498595714569092, + "learning_rate": 6.222547584187409e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.6909701824188232, + "num_tokens": 43877297.0, + "step": 1701 + }, + { + "epoch": 0.18690972984845158, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.5338494777679443, + "learning_rate": 6.226207906295753e-07, + "loss": 1.0069, + "mean_token_accuracy": 0.6944413781166077, + "num_tokens": 43899048.0, + "step": 1702 + }, + { + "epoch": 0.18701954755106523, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.566650867462158, + "learning_rate": 6.2298682284041e-07, + "loss": 0.9393, + 
"mean_token_accuracy": 0.7156239748001099, + "num_tokens": 43921618.0, + "step": 1703 + }, + { + "epoch": 0.1871293652536789, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.0804951190948486, + "learning_rate": 6.233528550512444e-07, + "loss": 1.0919, + "mean_token_accuracy": 0.6887548565864563, + "num_tokens": 43952213.0, + "step": 1704 + }, + { + "epoch": 0.18723918295629255, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.215742588043213, + "learning_rate": 6.237188872620789e-07, + "loss": 1.0199, + "mean_token_accuracy": 0.701485276222229, + "num_tokens": 43979710.0, + "step": 1705 + }, + { + "epoch": 0.18734900065890622, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.3533389568328857, + "learning_rate": 6.240849194729137e-07, + "loss": 1.0215, + "mean_token_accuracy": 0.696033775806427, + "num_tokens": 44005736.0, + "step": 1706 + }, + { + "epoch": 0.18745881836151987, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.093242645263672, + "learning_rate": 6.244509516837481e-07, + "loss": 1.1206, + "mean_token_accuracy": 0.6782061457633972, + "num_tokens": 44036907.0, + "step": 1707 + }, + { + "epoch": 0.18756863606413354, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.1087722778320312, + "learning_rate": 6.248169838945828e-07, + "loss": 1.0771, + "mean_token_accuracy": 0.6799578666687012, + "num_tokens": 44067574.0, + "step": 1708 + }, + { + "epoch": 0.1876784537667472, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.4310595989227295, + "learning_rate": 6.251830161054172e-07, + "loss": 1.1105, + "mean_token_accuracy": 0.6739690899848938, + "num_tokens": 44092129.0, + "step": 1709 + }, + { + "epoch": 0.18778827146936086, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 2.18011212348938, + "learning_rate": 6.255490483162517e-07, + "loss": 1.1318, + "mean_token_accuracy": 0.6731592416763306, + "num_tokens": 44124520.0, + "step": 1710 + }, + { + "epoch": 0.18789808917197454, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 
2.403228998184204, + "learning_rate": 6.259150805270863e-07, + "loss": 1.0821, + "mean_token_accuracy": 0.6899579167366028, + "num_tokens": 44150189.0, + "step": 1711 + }, + { + "epoch": 0.18800790687458818, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.595953941345215, + "learning_rate": 6.262811127379209e-07, + "loss": 0.9949, + "mean_token_accuracy": 0.7040380835533142, + "num_tokens": 44171622.0, + "step": 1712 + }, + { + "epoch": 0.18811772457720186, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.454425096511841, + "learning_rate": 6.266471449487554e-07, + "loss": 1.0853, + "mean_token_accuracy": 0.6914651989936829, + "num_tokens": 44196722.0, + "step": 1713 + }, + { + "epoch": 0.1882275422798155, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.3089752197265625, + "learning_rate": 6.2701317715959e-07, + "loss": 1.1233, + "mean_token_accuracy": 0.6695247888565063, + "num_tokens": 44228788.0, + "step": 1714 + }, + { + "epoch": 0.18833735998242918, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.6352086067199707, + "learning_rate": 6.273792093704245e-07, + "loss": 1.0699, + "mean_token_accuracy": 0.6823630332946777, + "num_tokens": 44253946.0, + "step": 1715 + }, + { + "epoch": 0.18844717768504282, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.498068332672119, + "learning_rate": 6.27745241581259e-07, + "loss": 1.0743, + "mean_token_accuracy": 0.6819896101951599, + "num_tokens": 44277666.0, + "step": 1716 + }, + { + "epoch": 0.1885569953876565, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.1272783279418945, + "learning_rate": 6.281112737920937e-07, + "loss": 1.0055, + "mean_token_accuracy": 0.7001588344573975, + "num_tokens": 44306333.0, + "step": 1717 + }, + { + "epoch": 0.18866681309027014, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.302828073501587, + "learning_rate": 6.284773060029282e-07, + "loss": 1.0904, + "mean_token_accuracy": 0.6762185096740723, + "num_tokens": 44333933.0, + "step": 1718 + }, + { + 
"epoch": 0.18877663079288381, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.461214303970337, + "learning_rate": 6.288433382137627e-07, + "loss": 1.0041, + "mean_token_accuracy": 0.696891188621521, + "num_tokens": 44358642.0, + "step": 1719 + }, + { + "epoch": 0.18888644849549746, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.6027591228485107, + "learning_rate": 6.292093704245973e-07, + "loss": 1.0368, + "mean_token_accuracy": 0.6876036524772644, + "num_tokens": 44381083.0, + "step": 1720 + }, + { + "epoch": 0.18899626619811113, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.406088352203369, + "learning_rate": 6.295754026354318e-07, + "loss": 1.088, + "mean_token_accuracy": 0.6834394931793213, + "num_tokens": 44406847.0, + "step": 1721 + }, + { + "epoch": 0.1891060839007248, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.347323179244995, + "learning_rate": 6.299414348462665e-07, + "loss": 1.0745, + "mean_token_accuracy": 0.6771430969238281, + "num_tokens": 44433809.0, + "step": 1722 + }, + { + "epoch": 0.18921590160333845, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.545116662979126, + "learning_rate": 6.30307467057101e-07, + "loss": 1.0653, + "mean_token_accuracy": 0.6835930347442627, + "num_tokens": 44458042.0, + "step": 1723 + }, + { + "epoch": 0.18932571930595213, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.9065706729888916, + "learning_rate": 6.306734992679355e-07, + "loss": 1.0177, + "mean_token_accuracy": 0.6969931721687317, + "num_tokens": 44478035.0, + "step": 1724 + }, + { + "epoch": 0.18943553700856577, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.1805436611175537, + "learning_rate": 6.310395314787701e-07, + "loss": 1.1215, + "mean_token_accuracy": 0.6658453941345215, + "num_tokens": 44513865.0, + "step": 1725 + }, + { + "epoch": 0.18954535471117945, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.7567341327667236, + "learning_rate": 6.314055636896046e-07, + "loss": 1.0413, + 
"mean_token_accuracy": 0.697422444820404, + "num_tokens": 44534733.0, + "step": 1726 + }, + { + "epoch": 0.1896551724137931, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.325474262237549, + "learning_rate": 6.317715959004391e-07, + "loss": 0.9925, + "mean_token_accuracy": 0.7020057439804077, + "num_tokens": 44560374.0, + "step": 1727 + }, + { + "epoch": 0.18976499011640677, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.8065505027770996, + "learning_rate": 6.321376281112738e-07, + "loss": 1.0314, + "mean_token_accuracy": 0.708989679813385, + "num_tokens": 44579550.0, + "step": 1728 + }, + { + "epoch": 0.1898748078190204, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.3169243335723877, + "learning_rate": 6.325036603221083e-07, + "loss": 1.0686, + "mean_token_accuracy": 0.6803832650184631, + "num_tokens": 44605935.0, + "step": 1729 + }, + { + "epoch": 0.1899846255216341, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.6459128856658936, + "learning_rate": 6.328696925329428e-07, + "loss": 1.0346, + "mean_token_accuracy": 0.6945544481277466, + "num_tokens": 44628919.0, + "step": 1730 + }, + { + "epoch": 0.19009444322424776, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 2.345259666442871, + "learning_rate": 6.332357247437774e-07, + "loss": 1.0643, + "mean_token_accuracy": 0.681179404258728, + "num_tokens": 44656105.0, + "step": 1731 + }, + { + "epoch": 0.1902042609268614, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.2310543060302734, + "learning_rate": 6.336017569546119e-07, + "loss": 1.072, + "mean_token_accuracy": 0.6782453060150146, + "num_tokens": 44686900.0, + "step": 1732 + }, + { + "epoch": 0.19031407862947508, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.6609108448028564, + "learning_rate": 6.339677891654465e-07, + "loss": 1.0169, + "mean_token_accuracy": 0.6976384520530701, + "num_tokens": 44710054.0, + "step": 1733 + }, + { + "epoch": 0.19042389633208873, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 
2.757654905319214, + "learning_rate": 6.343338213762811e-07, + "loss": 0.9999, + "mean_token_accuracy": 0.704803466796875, + "num_tokens": 44730033.0, + "step": 1734 + }, + { + "epoch": 0.1905337140347024, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.427304744720459, + "learning_rate": 6.346998535871156e-07, + "loss": 0.9656, + "mean_token_accuracy": 0.7071878910064697, + "num_tokens": 44756831.0, + "step": 1735 + }, + { + "epoch": 0.19064353173731605, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.3147528171539307, + "learning_rate": 6.350658857979501e-07, + "loss": 1.0449, + "mean_token_accuracy": 0.6877211928367615, + "num_tokens": 44782982.0, + "step": 1736 + }, + { + "epoch": 0.19075334943992972, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.342067241668701, + "learning_rate": 6.354319180087847e-07, + "loss": 1.1672, + "mean_token_accuracy": 0.6655222773551941, + "num_tokens": 44811212.0, + "step": 1737 + }, + { + "epoch": 0.19086316714254337, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.536372184753418, + "learning_rate": 6.357979502196193e-07, + "loss": 1.0198, + "mean_token_accuracy": 0.6987351179122925, + "num_tokens": 44833233.0, + "step": 1738 + }, + { + "epoch": 0.19097298484515704, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.1564126014709473, + "learning_rate": 6.361639824304539e-07, + "loss": 1.0677, + "mean_token_accuracy": 0.6874511241912842, + "num_tokens": 44861753.0, + "step": 1739 + }, + { + "epoch": 0.1910828025477707, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.249621629714966, + "learning_rate": 6.365300146412884e-07, + "loss": 1.0498, + "mean_token_accuracy": 0.6978278160095215, + "num_tokens": 44887824.0, + "step": 1740 + }, + { + "epoch": 0.19119262025038436, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.3816680908203125, + "learning_rate": 6.368960468521229e-07, + "loss": 1.0362, + "mean_token_accuracy": 0.6845417022705078, + "num_tokens": 44914371.0, + "step": 1741 + }, 
+ { + "epoch": 0.19130243795299803, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.190480947494507, + "learning_rate": 6.372620790629575e-07, + "loss": 1.1033, + "mean_token_accuracy": 0.673884928226471, + "num_tokens": 44943379.0, + "step": 1742 + }, + { + "epoch": 0.19141225565561168, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.5264415740966797, + "learning_rate": 6.37628111273792e-07, + "loss": 1.1202, + "mean_token_accuracy": 0.6797521114349365, + "num_tokens": 44967204.0, + "step": 1743 + }, + { + "epoch": 0.19152207335822535, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.2578840255737305, + "learning_rate": 6.379941434846266e-07, + "loss": 1.004, + "mean_token_accuracy": 0.6980558633804321, + "num_tokens": 44993218.0, + "step": 1744 + }, + { + "epoch": 0.191631891060839, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.451406955718994, + "learning_rate": 6.383601756954612e-07, + "loss": 1.0262, + "mean_token_accuracy": 0.6933801174163818, + "num_tokens": 45016771.0, + "step": 1745 + }, + { + "epoch": 0.19174170876345267, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 2.4585447311401367, + "learning_rate": 6.387262079062957e-07, + "loss": 1.0677, + "mean_token_accuracy": 0.6843966245651245, + "num_tokens": 45041696.0, + "step": 1746 + }, + { + "epoch": 0.19185152646606632, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.3610661029815674, + "learning_rate": 6.390922401171302e-07, + "loss": 1.0803, + "mean_token_accuracy": 0.6842159032821655, + "num_tokens": 45067615.0, + "step": 1747 + }, + { + "epoch": 0.19196134416868, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.501168966293335, + "learning_rate": 6.394582723279648e-07, + "loss": 1.0796, + "mean_token_accuracy": 0.6916075944900513, + "num_tokens": 45092065.0, + "step": 1748 + }, + { + "epoch": 0.19207116187129367, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.4881985187530518, + "learning_rate": 6.398243045387994e-07, + "loss": 1.0972, + 
"mean_token_accuracy": 0.6814267635345459, + "num_tokens": 45116555.0, + "step": 1749 + }, + { + "epoch": 0.1921809795739073, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.3255419731140137, + "learning_rate": 6.401903367496339e-07, + "loss": 1.1061, + "mean_token_accuracy": 0.6793285608291626, + "num_tokens": 45142968.0, + "step": 1750 + }, + { + "epoch": 0.19229079727652099, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.702047109603882, + "learning_rate": 6.405563689604685e-07, + "loss": 0.9916, + "mean_token_accuracy": 0.7005035281181335, + "num_tokens": 45162421.0, + "step": 1751 + }, + { + "epoch": 0.19240061497913463, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.3640406131744385, + "learning_rate": 6.40922401171303e-07, + "loss": 0.9621, + "mean_token_accuracy": 0.7099255919456482, + "num_tokens": 45188853.0, + "step": 1752 + }, + { + "epoch": 0.1925104326817483, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.4299426078796387, + "learning_rate": 6.412884333821375e-07, + "loss": 1.1018, + "mean_token_accuracy": 0.6952379941940308, + "num_tokens": 45211908.0, + "step": 1753 + }, + { + "epoch": 0.19262025038436195, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.377298355102539, + "learning_rate": 6.416544655929722e-07, + "loss": 0.9632, + "mean_token_accuracy": 0.7118997573852539, + "num_tokens": 45235897.0, + "step": 1754 + }, + { + "epoch": 0.19273006808697563, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.2585136890411377, + "learning_rate": 6.420204978038067e-07, + "loss": 1.2008, + "mean_token_accuracy": 0.6531033515930176, + "num_tokens": 45266668.0, + "step": 1755 + }, + { + "epoch": 0.19283988578958927, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.406116247177124, + "learning_rate": 6.423865300146413e-07, + "loss": 1.0974, + "mean_token_accuracy": 0.6733898520469666, + "num_tokens": 45291961.0, + "step": 1756 + }, + { + "epoch": 0.19294970349220295, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 
2.226972818374634, + "learning_rate": 6.427525622254758e-07, + "loss": 1.0611, + "mean_token_accuracy": 0.6887009143829346, + "num_tokens": 45321224.0, + "step": 1757 + }, + { + "epoch": 0.1930595211948166, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.316452980041504, + "learning_rate": 6.431185944363103e-07, + "loss": 1.0525, + "mean_token_accuracy": 0.6925338506698608, + "num_tokens": 45349120.0, + "step": 1758 + }, + { + "epoch": 0.19316933889743026, + "ewc_loss": 4.470348358154297e-06, + "grad_norm": 2.504058837890625, + "learning_rate": 6.434846266471449e-07, + "loss": 0.9715, + "mean_token_accuracy": 0.7083194255828857, + "num_tokens": 45370609.0, + "step": 1759 + }, + { + "epoch": 0.19327915660004394, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.094726085662842, + "learning_rate": 6.438506588579795e-07, + "loss": 1.0325, + "mean_token_accuracy": 0.6949878931045532, + "num_tokens": 45402004.0, + "step": 1760 + }, + { + "epoch": 0.19338897430265758, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.632014036178589, + "learning_rate": 6.44216691068814e-07, + "loss": 0.9816, + "mean_token_accuracy": 0.7068601250648499, + "num_tokens": 45423421.0, + "step": 1761 + }, + { + "epoch": 0.19349879200527126, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.2658681869506836, + "learning_rate": 6.445827232796486e-07, + "loss": 0.987, + "mean_token_accuracy": 0.7089519500732422, + "num_tokens": 45449405.0, + "step": 1762 + }, + { + "epoch": 0.1936086097078849, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.614748239517212, + "learning_rate": 6.449487554904831e-07, + "loss": 0.9965, + "mean_token_accuracy": 0.6933852434158325, + "num_tokens": 45470264.0, + "step": 1763 + }, + { + "epoch": 0.19371842741049858, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.1217308044433594, + "learning_rate": 6.453147877013176e-07, + "loss": 0.9952, + "mean_token_accuracy": 0.7089306116104126, + "num_tokens": 45501935.0, + "step": 1764 + }, + { + 
"epoch": 0.19382824511311222, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.348801612854004, + "learning_rate": 6.456808199121523e-07, + "loss": 1.0913, + "mean_token_accuracy": 0.6806987524032593, + "num_tokens": 45528933.0, + "step": 1765 + }, + { + "epoch": 0.1939380628157259, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.435859441757202, + "learning_rate": 6.460468521229868e-07, + "loss": 0.9513, + "mean_token_accuracy": 0.7128897905349731, + "num_tokens": 45551417.0, + "step": 1766 + }, + { + "epoch": 0.19404788051833954, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.1126153469085693, + "learning_rate": 6.464128843338213e-07, + "loss": 1.0444, + "mean_token_accuracy": 0.692302942276001, + "num_tokens": 45582175.0, + "step": 1767 + }, + { + "epoch": 0.19415769822095322, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.465967893600464, + "learning_rate": 6.467789165446559e-07, + "loss": 1.1129, + "mean_token_accuracy": 0.6717162132263184, + "num_tokens": 45606661.0, + "step": 1768 + }, + { + "epoch": 0.1942675159235669, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.4826650619506836, + "learning_rate": 6.471449487554904e-07, + "loss": 1.0725, + "mean_token_accuracy": 0.6811907887458801, + "num_tokens": 45632245.0, + "step": 1769 + }, + { + "epoch": 0.19437733362618054, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 2.060842752456665, + "learning_rate": 6.475109809663251e-07, + "loss": 1.0892, + "mean_token_accuracy": 0.6825469136238098, + "num_tokens": 45666175.0, + "step": 1770 + }, + { + "epoch": 0.1944871513287942, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.3920795917510986, + "learning_rate": 6.478770131771596e-07, + "loss": 1.0527, + "mean_token_accuracy": 0.6969884634017944, + "num_tokens": 45692819.0, + "step": 1771 + }, + { + "epoch": 0.19459696903140786, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.165034294128418, + "learning_rate": 6.482430453879941e-07, + "loss": 1.0507, + 
"mean_token_accuracy": 0.6938786506652832, + "num_tokens": 45721537.0, + "step": 1772 + }, + { + "epoch": 0.19470678673402153, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.613966703414917, + "learning_rate": 6.486090775988287e-07, + "loss": 1.0912, + "mean_token_accuracy": 0.6864945888519287, + "num_tokens": 45745690.0, + "step": 1773 + }, + { + "epoch": 0.19481660443663518, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.447493314743042, + "learning_rate": 6.489751098096632e-07, + "loss": 1.1185, + "mean_token_accuracy": 0.6743407249450684, + "num_tokens": 45771981.0, + "step": 1774 + }, + { + "epoch": 0.19492642213924885, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.531022548675537, + "learning_rate": 6.493411420204977e-07, + "loss": 1.0484, + "mean_token_accuracy": 0.6938334107398987, + "num_tokens": 45796644.0, + "step": 1775 + }, + { + "epoch": 0.1950362398418625, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 3.0221736431121826, + "learning_rate": 6.497071742313324e-07, + "loss": 1.0877, + "mean_token_accuracy": 0.6766634583473206, + "num_tokens": 45815886.0, + "step": 1776 + }, + { + "epoch": 0.19514605754447617, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.3089451789855957, + "learning_rate": 6.500732064421669e-07, + "loss": 1.0612, + "mean_token_accuracy": 0.6873929500579834, + "num_tokens": 45843574.0, + "step": 1777 + }, + { + "epoch": 0.19525587524708984, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.2980334758758545, + "learning_rate": 6.504392386530014e-07, + "loss": 1.028, + "mean_token_accuracy": 0.6973247528076172, + "num_tokens": 45870615.0, + "step": 1778 + }, + { + "epoch": 0.1953656929497035, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.5283167362213135, + "learning_rate": 6.50805270863836e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6952298879623413, + "num_tokens": 45892574.0, + "step": 1779 + }, + { + "epoch": 0.19547551065231716, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 
2.490119457244873, + "learning_rate": 6.511713030746705e-07, + "loss": 1.1051, + "mean_token_accuracy": 0.6756008863449097, + "num_tokens": 45917428.0, + "step": 1780 + }, + { + "epoch": 0.1955853283549308, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.1594271659851074, + "learning_rate": 6.515373352855051e-07, + "loss": 1.0307, + "mean_token_accuracy": 0.6901370286941528, + "num_tokens": 45947191.0, + "step": 1781 + }, + { + "epoch": 0.19569514605754448, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.308586359024048, + "learning_rate": 6.519033674963397e-07, + "loss": 1.0072, + "mean_token_accuracy": 0.6998704075813293, + "num_tokens": 45973562.0, + "step": 1782 + }, + { + "epoch": 0.19580496376015813, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.4027395248413086, + "learning_rate": 6.522693997071742e-07, + "loss": 1.015, + "mean_token_accuracy": 0.7014646530151367, + "num_tokens": 46000779.0, + "step": 1783 + }, + { + "epoch": 0.1959147814627718, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 2.390467643737793, + "learning_rate": 6.526354319180087e-07, + "loss": 1.0466, + "mean_token_accuracy": 0.6925391554832458, + "num_tokens": 46025924.0, + "step": 1784 + }, + { + "epoch": 0.19602459916538545, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.3515589237213135, + "learning_rate": 6.530014641288433e-07, + "loss": 1.0585, + "mean_token_accuracy": 0.6854251623153687, + "num_tokens": 46052335.0, + "step": 1785 + }, + { + "epoch": 0.19613441686799912, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.510554552078247, + "learning_rate": 6.533674963396779e-07, + "loss": 0.9831, + "mean_token_accuracy": 0.7064592838287354, + "num_tokens": 46075495.0, + "step": 1786 + }, + { + "epoch": 0.1962442345706128, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.745441198348999, + "learning_rate": 6.537335285505125e-07, + "loss": 1.0033, + "mean_token_accuracy": 0.6960027813911438, + "num_tokens": 46094772.0, + "step": 1787 + }, + { + 
"epoch": 0.19635405227322644, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.386396884918213, + "learning_rate": 6.54099560761347e-07, + "loss": 1.1107, + "mean_token_accuracy": 0.6731115579605103, + "num_tokens": 46123044.0, + "step": 1788 + }, + { + "epoch": 0.19646386997584012, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 2.4088122844696045, + "learning_rate": 6.544655929721815e-07, + "loss": 1.0448, + "mean_token_accuracy": 0.6920719146728516, + "num_tokens": 46146887.0, + "step": 1789 + }, + { + "epoch": 0.19657368767845376, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.5831356048583984, + "learning_rate": 6.548316251830161e-07, + "loss": 1.0377, + "mean_token_accuracy": 0.698712944984436, + "num_tokens": 46168260.0, + "step": 1790 + }, + { + "epoch": 0.19668350538106744, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.557170867919922, + "learning_rate": 6.551976573938506e-07, + "loss": 1.096, + "mean_token_accuracy": 0.6848652958869934, + "num_tokens": 46190266.0, + "step": 1791 + }, + { + "epoch": 0.19679332308368108, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.4029340744018555, + "learning_rate": 6.555636896046852e-07, + "loss": 1.0638, + "mean_token_accuracy": 0.686232328414917, + "num_tokens": 46214139.0, + "step": 1792 + }, + { + "epoch": 0.19690314078629476, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.72377347946167, + "learning_rate": 6.559297218155198e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6906066536903381, + "num_tokens": 46235396.0, + "step": 1793 + }, + { + "epoch": 0.1970129584889084, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.285862922668457, + "learning_rate": 6.562957540263543e-07, + "loss": 1.0279, + "mean_token_accuracy": 0.6942108869552612, + "num_tokens": 46261539.0, + "step": 1794 + }, + { + "epoch": 0.19712277619152208, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.4062917232513428, + "learning_rate": 6.566617862371888e-07, + "loss": 1.0316, + 
"mean_token_accuracy": 0.6950410604476929, + "num_tokens": 46286833.0, + "step": 1795 + }, + { + "epoch": 0.19723259389413572, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.3257343769073486, + "learning_rate": 6.570278184480234e-07, + "loss": 1.0775, + "mean_token_accuracy": 0.6827669143676758, + "num_tokens": 46313733.0, + "step": 1796 + }, + { + "epoch": 0.1973424115967494, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.4144704341888428, + "learning_rate": 6.57393850658858e-07, + "loss": 1.0788, + "mean_token_accuracy": 0.6835861206054688, + "num_tokens": 46337087.0, + "step": 1797 + }, + { + "epoch": 0.19745222929936307, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.8620290756225586, + "learning_rate": 6.577598828696925e-07, + "loss": 0.9768, + "mean_token_accuracy": 0.7018822431564331, + "num_tokens": 46356648.0, + "step": 1798 + }, + { + "epoch": 0.19756204700197671, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.4473721981048584, + "learning_rate": 6.581259150805271e-07, + "loss": 0.947, + "mean_token_accuracy": 0.7118923664093018, + "num_tokens": 46378939.0, + "step": 1799 + }, + { + "epoch": 0.1976718647045904, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.2929654121398926, + "learning_rate": 6.584919472913616e-07, + "loss": 1.0498, + "mean_token_accuracy": 0.685431957244873, + "num_tokens": 46408171.0, + "step": 1800 + }, + { + "epoch": 0.19778168240720403, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.6046407222747803, + "learning_rate": 6.588579795021961e-07, + "loss": 1.0067, + "mean_token_accuracy": 0.7018272280693054, + "num_tokens": 46431563.0, + "step": 1801 + }, + { + "epoch": 0.1978915001098177, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.511587619781494, + "learning_rate": 6.592240117130308e-07, + "loss": 1.0356, + "mean_token_accuracy": 0.7014815807342529, + "num_tokens": 46455897.0, + "step": 1802 + }, + { + "epoch": 0.19800131781243135, + "ewc_loss": 4.589557647705078e-06, + 
"grad_norm": 2.2356674671173096, + "learning_rate": 6.595900439238653e-07, + "loss": 1.0952, + "mean_token_accuracy": 0.674598217010498, + "num_tokens": 46485421.0, + "step": 1803 + }, + { + "epoch": 0.19811113551504503, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 2.818441152572632, + "learning_rate": 6.599560761346999e-07, + "loss": 1.0099, + "mean_token_accuracy": 0.6975648999214172, + "num_tokens": 46504392.0, + "step": 1804 + }, + { + "epoch": 0.19822095321765867, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.5934877395629883, + "learning_rate": 6.603221083455344e-07, + "loss": 0.9595, + "mean_token_accuracy": 0.7140854597091675, + "num_tokens": 46526987.0, + "step": 1805 + }, + { + "epoch": 0.19833077092027235, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.4541728496551514, + "learning_rate": 6.606881405563689e-07, + "loss": 1.0237, + "mean_token_accuracy": 0.697242259979248, + "num_tokens": 46552167.0, + "step": 1806 + }, + { + "epoch": 0.19844058862288602, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.500547170639038, + "learning_rate": 6.610541727672035e-07, + "loss": 1.0436, + "mean_token_accuracy": 0.685738742351532, + "num_tokens": 46578431.0, + "step": 1807 + }, + { + "epoch": 0.19855040632549967, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.550865888595581, + "learning_rate": 6.614202049780381e-07, + "loss": 0.9255, + "mean_token_accuracy": 0.7175000309944153, + "num_tokens": 46597882.0, + "step": 1808 + }, + { + "epoch": 0.19866022402811334, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.181548833847046, + "learning_rate": 6.617862371888726e-07, + "loss": 1.0999, + "mean_token_accuracy": 0.690569281578064, + "num_tokens": 46627173.0, + "step": 1809 + }, + { + "epoch": 0.198770041730727, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.4864320755004883, + "learning_rate": 6.621522693997072e-07, + "loss": 1.0555, + "mean_token_accuracy": 0.6932936906814575, + "num_tokens": 46652241.0, + "step": 
1810 + }, + { + "epoch": 0.19887985943334066, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.3605544567108154, + "learning_rate": 6.625183016105417e-07, + "loss": 1.0979, + "mean_token_accuracy": 0.6748313903808594, + "num_tokens": 46681172.0, + "step": 1811 + }, + { + "epoch": 0.1989896771359543, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.2396581172943115, + "learning_rate": 6.628843338213762e-07, + "loss": 1.0806, + "mean_token_accuracy": 0.6822333335876465, + "num_tokens": 46711892.0, + "step": 1812 + }, + { + "epoch": 0.19909949483856798, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 2.1639246940612793, + "learning_rate": 6.632503660322109e-07, + "loss": 1.0813, + "mean_token_accuracy": 0.6797014474868774, + "num_tokens": 46744839.0, + "step": 1813 + }, + { + "epoch": 0.19920931254118163, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.4182374477386475, + "learning_rate": 6.636163982430454e-07, + "loss": 0.9465, + "mean_token_accuracy": 0.7040873765945435, + "num_tokens": 46768706.0, + "step": 1814 + }, + { + "epoch": 0.1993191302437953, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.736818552017212, + "learning_rate": 6.639824304538799e-07, + "loss": 1.0012, + "mean_token_accuracy": 0.7003761529922485, + "num_tokens": 46789029.0, + "step": 1815 + }, + { + "epoch": 0.19942894794640897, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.1854426860809326, + "learning_rate": 6.643484626647145e-07, + "loss": 1.0818, + "mean_token_accuracy": 0.6806513071060181, + "num_tokens": 46819385.0, + "step": 1816 + }, + { + "epoch": 0.19953876564902262, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.330559015274048, + "learning_rate": 6.64714494875549e-07, + "loss": 1.0635, + "mean_token_accuracy": 0.6841527223587036, + "num_tokens": 46845617.0, + "step": 1817 + }, + { + "epoch": 0.1996485833516363, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.547067165374756, + "learning_rate": 6.650805270863836e-07, + "loss": 
0.9904, + "mean_token_accuracy": 0.701797604560852, + "num_tokens": 46867168.0, + "step": 1818 + }, + { + "epoch": 0.19975840105424994, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 2.5584940910339355, + "learning_rate": 6.654465592972182e-07, + "loss": 0.9915, + "mean_token_accuracy": 0.7052910327911377, + "num_tokens": 46893199.0, + "step": 1819 + }, + { + "epoch": 0.1998682187568636, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 3.367859363555908, + "learning_rate": 6.658125915080527e-07, + "loss": 1.0311, + "mean_token_accuracy": 0.6950178146362305, + "num_tokens": 46912484.0, + "step": 1820 + }, + { + "epoch": 0.19997803645947726, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.095919132232666, + "learning_rate": 6.661786237188873e-07, + "loss": 1.1129, + "mean_token_accuracy": 0.6842160820960999, + "num_tokens": 46949071.0, + "step": 1821 + }, + { + "epoch": 0.20008785416209093, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 2.3411173820495605, + "learning_rate": 6.665446559297218e-07, + "loss": 1.0015, + "mean_token_accuracy": 0.6965291500091553, + "num_tokens": 46976660.0, + "step": 1822 + }, + { + "epoch": 0.20019767186470458, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.3435933589935303, + "learning_rate": 6.669106881405563e-07, + "loss": 0.9248, + "mean_token_accuracy": 0.7260533571243286, + "num_tokens": 47003264.0, + "step": 1823 + }, + { + "epoch": 0.20030748956731825, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.024103879928589, + "learning_rate": 6.67276720351391e-07, + "loss": 1.0451, + "mean_token_accuracy": 0.6861734986305237, + "num_tokens": 47036317.0, + "step": 1824 + }, + { + "epoch": 0.2004173072699319, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.502115488052368, + "learning_rate": 6.676427525622255e-07, + "loss": 1.0728, + "mean_token_accuracy": 0.6849780082702637, + "num_tokens": 47059959.0, + "step": 1825 + }, + { + "epoch": 0.20052712497254557, + "ewc_loss": 4.708766937255859e-06, + 
"grad_norm": 2.506535768508911, + "learning_rate": 6.6800878477306e-07, + "loss": 1.0217, + "mean_token_accuracy": 0.6955149173736572, + "num_tokens": 47083996.0, + "step": 1826 + }, + { + "epoch": 0.20063694267515925, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.2396817207336426, + "learning_rate": 6.683748169838946e-07, + "loss": 1.0255, + "mean_token_accuracy": 0.716917872428894, + "num_tokens": 47111746.0, + "step": 1827 + }, + { + "epoch": 0.2007467603777729, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.331427812576294, + "learning_rate": 6.687408491947291e-07, + "loss": 1.1006, + "mean_token_accuracy": 0.6844683289527893, + "num_tokens": 47137621.0, + "step": 1828 + }, + { + "epoch": 0.20085657808038657, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.440610408782959, + "learning_rate": 6.691068814055637e-07, + "loss": 0.9998, + "mean_token_accuracy": 0.6987555623054504, + "num_tokens": 47162400.0, + "step": 1829 + }, + { + "epoch": 0.2009663957830002, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 2.6553797721862793, + "learning_rate": 6.694729136163983e-07, + "loss": 0.997, + "mean_token_accuracy": 0.7017475366592407, + "num_tokens": 47186479.0, + "step": 1830 + }, + { + "epoch": 0.20107621348561389, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.080942392349243, + "learning_rate": 6.698389458272328e-07, + "loss": 1.0916, + "mean_token_accuracy": 0.6876802444458008, + "num_tokens": 47217322.0, + "step": 1831 + }, + { + "epoch": 0.20118603118822753, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.0948405265808105, + "learning_rate": 6.702049780380673e-07, + "loss": 1.0333, + "mean_token_accuracy": 0.6881229281425476, + "num_tokens": 47247815.0, + "step": 1832 + }, + { + "epoch": 0.2012958488908412, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.428718090057373, + "learning_rate": 6.705710102489019e-07, + "loss": 1.0258, + "mean_token_accuracy": 0.6954665780067444, + "num_tokens": 47274328.0, + "step": 1833 + 
}, + { + "epoch": 0.20140566659345485, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.211021661758423, + "learning_rate": 6.709370424597365e-07, + "loss": 1.0424, + "mean_token_accuracy": 0.6860976815223694, + "num_tokens": 47303590.0, + "step": 1834 + }, + { + "epoch": 0.20151548429606853, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.266415596008301, + "learning_rate": 6.71303074670571e-07, + "loss": 0.9089, + "mean_token_accuracy": 0.729573130607605, + "num_tokens": 47331291.0, + "step": 1835 + }, + { + "epoch": 0.2016253019986822, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.410863161087036, + "learning_rate": 6.716691068814056e-07, + "loss": 1.0203, + "mean_token_accuracy": 0.6954379081726074, + "num_tokens": 47356358.0, + "step": 1836 + }, + { + "epoch": 0.20173511970129585, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.3050918579101562, + "learning_rate": 6.720351390922401e-07, + "loss": 1.0097, + "mean_token_accuracy": 0.6994857788085938, + "num_tokens": 47383864.0, + "step": 1837 + }, + { + "epoch": 0.20184493740390952, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.394681215286255, + "learning_rate": 6.724011713030747e-07, + "loss": 1.0401, + "mean_token_accuracy": 0.6907350420951843, + "num_tokens": 47408405.0, + "step": 1838 + }, + { + "epoch": 0.20195475510652316, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.4110724925994873, + "learning_rate": 6.727672035139092e-07, + "loss": 1.0104, + "mean_token_accuracy": 0.7022641897201538, + "num_tokens": 47432899.0, + "step": 1839 + }, + { + "epoch": 0.20206457280913684, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.423851728439331, + "learning_rate": 6.731332357247438e-07, + "loss": 1.1367, + "mean_token_accuracy": 0.6700567007064819, + "num_tokens": 47459200.0, + "step": 1840 + }, + { + "epoch": 0.20217439051175048, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.171600341796875, + "learning_rate": 6.734992679355784e-07, + "loss": 1.128, + 
"mean_token_accuracy": 0.6719978451728821, + "num_tokens": 47489755.0, + "step": 1841 + }, + { + "epoch": 0.20228420821436416, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.067911386489868, + "learning_rate": 6.738653001464129e-07, + "loss": 1.0509, + "mean_token_accuracy": 0.687575101852417, + "num_tokens": 47521733.0, + "step": 1842 + }, + { + "epoch": 0.2023940259169778, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.6382758617401123, + "learning_rate": 6.742313323572474e-07, + "loss": 0.9663, + "mean_token_accuracy": 0.7115259170532227, + "num_tokens": 47541998.0, + "step": 1843 + }, + { + "epoch": 0.20250384361959148, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 2.222445011138916, + "learning_rate": 6.74597364568082e-07, + "loss": 1.0419, + "mean_token_accuracy": 0.687380313873291, + "num_tokens": 47569865.0, + "step": 1844 + }, + { + "epoch": 0.20261366132220515, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 2.3647918701171875, + "learning_rate": 6.749633967789166e-07, + "loss": 1.1295, + "mean_token_accuracy": 0.6735361218452454, + "num_tokens": 47598795.0, + "step": 1845 + }, + { + "epoch": 0.2027234790248188, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 2.2519474029541016, + "learning_rate": 6.753294289897511e-07, + "loss": 1.1282, + "mean_token_accuracy": 0.6711903214454651, + "num_tokens": 47627078.0, + "step": 1846 + }, + { + "epoch": 0.20283329672743247, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.0822086334228516, + "learning_rate": 6.756954612005857e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.695512056350708, + "num_tokens": 47660542.0, + "step": 1847 + }, + { + "epoch": 0.20294311443004612, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.4189531803131104, + "learning_rate": 6.760614934114202e-07, + "loss": 1.0934, + "mean_token_accuracy": 0.6792166829109192, + "num_tokens": 47687250.0, + "step": 1848 + }, + { + "epoch": 0.2030529321326598, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 
2.6304843425750732, + "learning_rate": 6.764275256222546e-07, + "loss": 0.9899, + "mean_token_accuracy": 0.6957434415817261, + "num_tokens": 47708672.0, + "step": 1849 + }, + { + "epoch": 0.20316274983527344, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.626511812210083, + "learning_rate": 6.767935578330894e-07, + "loss": 0.9712, + "mean_token_accuracy": 0.7129931449890137, + "num_tokens": 47730052.0, + "step": 1850 + }, + { + "epoch": 0.2032725675378871, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 2.472318172454834, + "learning_rate": 6.771595900439239e-07, + "loss": 1.1075, + "mean_token_accuracy": 0.67388916015625, + "num_tokens": 47757372.0, + "step": 1851 + }, + { + "epoch": 0.20338238524050076, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.4795126914978027, + "learning_rate": 6.775256222547583e-07, + "loss": 1.0665, + "mean_token_accuracy": 0.6775414943695068, + "num_tokens": 47784543.0, + "step": 1852 + }, + { + "epoch": 0.20349220294311443, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 2.2407126426696777, + "learning_rate": 6.77891654465593e-07, + "loss": 1.0581, + "mean_token_accuracy": 0.6855281591415405, + "num_tokens": 47813373.0, + "step": 1853 + }, + { + "epoch": 0.2036020206457281, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 2.40417218208313, + "learning_rate": 6.782576866764274e-07, + "loss": 1.0751, + "mean_token_accuracy": 0.6865139007568359, + "num_tokens": 47837686.0, + "step": 1854 + }, + { + "epoch": 0.20371183834834175, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 2.1293466091156006, + "learning_rate": 6.78623718887262e-07, + "loss": 1.0813, + "mean_token_accuracy": 0.6890091300010681, + "num_tokens": 47870281.0, + "step": 1855 + }, + { + "epoch": 0.20382165605095542, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 2.25299072265625, + "learning_rate": 6.789897510980966e-07, + "loss": 1.1003, + "mean_token_accuracy": 0.6734716892242432, + "num_tokens": 47899080.0, + "step": 1856 + }, + { + "epoch": 
0.20393147375356907, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 2.342961311340332, + "learning_rate": 6.793557833089311e-07, + "loss": 1.0766, + "mean_token_accuracy": 0.6849681735038757, + "num_tokens": 47923965.0, + "step": 1857 + }, + { + "epoch": 0.20404129145618274, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 2.6515281200408936, + "learning_rate": 6.797218155197657e-07, + "loss": 1.006, + "mean_token_accuracy": 0.7001534104347229, + "num_tokens": 47945835.0, + "step": 1858 + }, + { + "epoch": 0.2041511091587964, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.4403440952301025, + "learning_rate": 6.800878477306002e-07, + "loss": 1.0445, + "mean_token_accuracy": 0.6874161958694458, + "num_tokens": 47971101.0, + "step": 1859 + }, + { + "epoch": 0.20426092686141006, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.4459705352783203, + "learning_rate": 6.804538799414347e-07, + "loss": 1.102, + "mean_token_accuracy": 0.6924660205841064, + "num_tokens": 47995297.0, + "step": 1860 + }, + { + "epoch": 0.2043707445640237, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.1557984352111816, + "learning_rate": 6.808199121522694e-07, + "loss": 1.0882, + "mean_token_accuracy": 0.6781267523765564, + "num_tokens": 48028033.0, + "step": 1861 + }, + { + "epoch": 0.20448056226663738, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.5655243396759033, + "learning_rate": 6.811859443631039e-07, + "loss": 1.0551, + "mean_token_accuracy": 0.6857144832611084, + "num_tokens": 48050471.0, + "step": 1862 + }, + { + "epoch": 0.20459037996925103, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.5418457984924316, + "learning_rate": 6.815519765739384e-07, + "loss": 1.0479, + "mean_token_accuracy": 0.6891350746154785, + "num_tokens": 48073222.0, + "step": 1863 + }, + { + "epoch": 0.2047001976718647, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.413133144378662, + "learning_rate": 6.81918008784773e-07, + "loss": 1.038, + "mean_token_accuracy": 
0.6922634840011597, + "num_tokens": 48097610.0, + "step": 1864 + }, + { + "epoch": 0.20481001537447838, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.6518983840942383, + "learning_rate": 6.822840409956075e-07, + "loss": 1.0124, + "mean_token_accuracy": 0.701135516166687, + "num_tokens": 48119076.0, + "step": 1865 + }, + { + "epoch": 0.20491983307709202, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.311392307281494, + "learning_rate": 6.826500732064421e-07, + "loss": 1.001, + "mean_token_accuracy": 0.6991109848022461, + "num_tokens": 48145067.0, + "step": 1866 + }, + { + "epoch": 0.2050296507797057, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.3986408710479736, + "learning_rate": 6.830161054172767e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.6772288084030151, + "num_tokens": 48169280.0, + "step": 1867 + }, + { + "epoch": 0.20513946848231934, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.1523196697235107, + "learning_rate": 6.833821376281112e-07, + "loss": 0.9764, + "mean_token_accuracy": 0.7043222188949585, + "num_tokens": 48201385.0, + "step": 1868 + }, + { + "epoch": 0.20524928618493302, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 2.6288034915924072, + "learning_rate": 6.837481698389457e-07, + "loss": 1.0113, + "mean_token_accuracy": 0.6956906318664551, + "num_tokens": 48227893.0, + "step": 1869 + }, + { + "epoch": 0.20535910388754666, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.4178810119628906, + "learning_rate": 6.841142020497803e-07, + "loss": 1.0727, + "mean_token_accuracy": 0.6844275593757629, + "num_tokens": 48255573.0, + "step": 1870 + }, + { + "epoch": 0.20546892159016034, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.9918320178985596, + "learning_rate": 6.844802342606148e-07, + "loss": 0.9539, + "mean_token_accuracy": 0.7204498648643494, + "num_tokens": 48276990.0, + "step": 1871 + }, + { + "epoch": 0.20557873929277398, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.5544865131378174, 
+ "learning_rate": 6.848462664714495e-07, + "loss": 1.0433, + "mean_token_accuracy": 0.6937178373336792, + "num_tokens": 48298559.0, + "step": 1872 + }, + { + "epoch": 0.20568855699538766, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.280296802520752, + "learning_rate": 6.85212298682284e-07, + "loss": 1.0871, + "mean_token_accuracy": 0.6743979454040527, + "num_tokens": 48325901.0, + "step": 1873 + }, + { + "epoch": 0.20579837469800133, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.6858670711517334, + "learning_rate": 6.855783308931185e-07, + "loss": 1.0974, + "mean_token_accuracy": 0.6839660406112671, + "num_tokens": 48347441.0, + "step": 1874 + }, + { + "epoch": 0.20590819240061498, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 3.2736010551452637, + "learning_rate": 6.859443631039531e-07, + "loss": 1.0431, + "mean_token_accuracy": 0.6927621364593506, + "num_tokens": 48365276.0, + "step": 1875 + }, + { + "epoch": 0.20601801010322865, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.64123797416687, + "learning_rate": 6.863103953147876e-07, + "loss": 0.95, + "mean_token_accuracy": 0.7146434783935547, + "num_tokens": 48385953.0, + "step": 1876 + }, + { + "epoch": 0.2061278278058423, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.444924831390381, + "learning_rate": 6.866764275256222e-07, + "loss": 1.0002, + "mean_token_accuracy": 0.7016901969909668, + "num_tokens": 48410901.0, + "step": 1877 + }, + { + "epoch": 0.20623764550845597, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.322554111480713, + "learning_rate": 6.870424597364568e-07, + "loss": 1.0941, + "mean_token_accuracy": 0.6793787479400635, + "num_tokens": 48438408.0, + "step": 1878 + }, + { + "epoch": 0.20634746321106961, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 2.8062045574188232, + "learning_rate": 6.874084919472913e-07, + "loss": 1.0112, + "mean_token_accuracy": 0.7013716697692871, + "num_tokens": 48457518.0, + "step": 1879 + }, + { + "epoch": 
0.2064572809136833, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.288856029510498, + "learning_rate": 6.877745241581258e-07, + "loss": 0.9989, + "mean_token_accuracy": 0.7078457474708557, + "num_tokens": 48486392.0, + "step": 1880 + }, + { + "epoch": 0.20656709861629693, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.285010576248169, + "learning_rate": 6.881405563689604e-07, + "loss": 1.1349, + "mean_token_accuracy": 0.6767560839653015, + "num_tokens": 48516058.0, + "step": 1881 + }, + { + "epoch": 0.2066769163189106, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.2691962718963623, + "learning_rate": 6.88506588579795e-07, + "loss": 1.0862, + "mean_token_accuracy": 0.6775070428848267, + "num_tokens": 48546658.0, + "step": 1882 + }, + { + "epoch": 0.20678673402152428, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.6047801971435547, + "learning_rate": 6.888726207906295e-07, + "loss": 1.0696, + "mean_token_accuracy": 0.6808596849441528, + "num_tokens": 48569941.0, + "step": 1883 + }, + { + "epoch": 0.20689655172413793, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.6921088695526123, + "learning_rate": 6.892386530014641e-07, + "loss": 1.0112, + "mean_token_accuracy": 0.6985504031181335, + "num_tokens": 48590774.0, + "step": 1884 + }, + { + "epoch": 0.2070063694267516, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.2571663856506348, + "learning_rate": 6.896046852122986e-07, + "loss": 1.0727, + "mean_token_accuracy": 0.679856538772583, + "num_tokens": 48617652.0, + "step": 1885 + }, + { + "epoch": 0.20711618712936525, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.372769594192505, + "learning_rate": 6.899707174231331e-07, + "loss": 1.019, + "mean_token_accuracy": 0.6904285550117493, + "num_tokens": 48642381.0, + "step": 1886 + }, + { + "epoch": 0.20722600483197892, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.263193130493164, + "learning_rate": 6.903367496339677e-07, + "loss": 1.0126, + "mean_token_accuracy": 
0.7017054557800293, + "num_tokens": 48671585.0, + "step": 1887 + }, + { + "epoch": 0.20733582253459257, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.227269172668457, + "learning_rate": 6.907027818448023e-07, + "loss": 1.0812, + "mean_token_accuracy": 0.6793104410171509, + "num_tokens": 48703218.0, + "step": 1888 + }, + { + "epoch": 0.20744564023720624, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 2.236358165740967, + "learning_rate": 6.910688140556369e-07, + "loss": 1.0824, + "mean_token_accuracy": 0.679069459438324, + "num_tokens": 48733166.0, + "step": 1889 + }, + { + "epoch": 0.2075554579398199, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.4917209148406982, + "learning_rate": 6.914348462664714e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.6947040557861328, + "num_tokens": 48756462.0, + "step": 1890 + }, + { + "epoch": 0.20766527564243356, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.275174856185913, + "learning_rate": 6.918008784773059e-07, + "loss": 1.0944, + "mean_token_accuracy": 0.6889236569404602, + "num_tokens": 48785002.0, + "step": 1891 + }, + { + "epoch": 0.20777509334504723, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.2973411083221436, + "learning_rate": 6.921669106881405e-07, + "loss": 0.9858, + "mean_token_accuracy": 0.7081948518753052, + "num_tokens": 48810763.0, + "step": 1892 + }, + { + "epoch": 0.20788491104766088, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.435393810272217, + "learning_rate": 6.925329428989751e-07, + "loss": 1.0135, + "mean_token_accuracy": 0.6964699029922485, + "num_tokens": 48835009.0, + "step": 1893 + }, + { + "epoch": 0.20799472875027455, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.4895365238189697, + "learning_rate": 6.928989751098096e-07, + "loss": 1.0106, + "mean_token_accuracy": 0.6984033584594727, + "num_tokens": 48858156.0, + "step": 1894 + }, + { + "epoch": 0.2081045464528882, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.678339719772339, + 
"learning_rate": 6.932650073206442e-07, + "loss": 1.0768, + "mean_token_accuracy": 0.6802142858505249, + "num_tokens": 48882263.0, + "step": 1895 + }, + { + "epoch": 0.20821436415550187, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.6267666816711426, + "learning_rate": 6.936310395314787e-07, + "loss": 1.0695, + "mean_token_accuracy": 0.6861281991004944, + "num_tokens": 48905461.0, + "step": 1896 + }, + { + "epoch": 0.20832418185811552, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 2.410299777984619, + "learning_rate": 6.939970717423132e-07, + "loss": 1.0419, + "mean_token_accuracy": 0.6894757747650146, + "num_tokens": 48931619.0, + "step": 1897 + }, + { + "epoch": 0.2084339995607292, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.4869191646575928, + "learning_rate": 6.943631039531479e-07, + "loss": 1.08, + "mean_token_accuracy": 0.6815085411071777, + "num_tokens": 48957444.0, + "step": 1898 + }, + { + "epoch": 0.20854381726334284, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.318964958190918, + "learning_rate": 6.947291361639824e-07, + "loss": 0.9794, + "mean_token_accuracy": 0.701862633228302, + "num_tokens": 48986415.0, + "step": 1899 + }, + { + "epoch": 0.2086536349659565, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 2.4940695762634277, + "learning_rate": 6.950951683748169e-07, + "loss": 1.0102, + "mean_token_accuracy": 0.6998608708381653, + "num_tokens": 49010061.0, + "step": 1900 + }, + { + "epoch": 0.20876345266857016, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.5282793045043945, + "learning_rate": 6.954612005856515e-07, + "loss": 0.9665, + "mean_token_accuracy": 0.7124642729759216, + "num_tokens": 49033376.0, + "step": 1901 + }, + { + "epoch": 0.20887327037118383, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.487903118133545, + "learning_rate": 6.95827232796486e-07, + "loss": 1.0324, + "mean_token_accuracy": 0.6941099762916565, + "num_tokens": 49057563.0, + "step": 1902 + }, + { + "epoch": 
0.2089830880737975, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.3596906661987305, + "learning_rate": 6.961932650073206e-07, + "loss": 1.1132, + "mean_token_accuracy": 0.6696068644523621, + "num_tokens": 49085030.0, + "step": 1903 + }, + { + "epoch": 0.20909290577641115, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.5795912742614746, + "learning_rate": 6.965592972181552e-07, + "loss": 0.9497, + "mean_token_accuracy": 0.7105512619018555, + "num_tokens": 49104353.0, + "step": 1904 + }, + { + "epoch": 0.20920272347902483, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.33705472946167, + "learning_rate": 6.969253294289897e-07, + "loss": 1.0546, + "mean_token_accuracy": 0.6949144601821899, + "num_tokens": 49134413.0, + "step": 1905 + }, + { + "epoch": 0.20931254118163847, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 2.2728772163391113, + "learning_rate": 6.972913616398243e-07, + "loss": 1.0533, + "mean_token_accuracy": 0.6907474994659424, + "num_tokens": 49164137.0, + "step": 1906 + }, + { + "epoch": 0.20942235888425215, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.4287185668945312, + "learning_rate": 6.976573938506588e-07, + "loss": 1.0373, + "mean_token_accuracy": 0.6883101463317871, + "num_tokens": 49192401.0, + "step": 1907 + }, + { + "epoch": 0.2095321765868658, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.3884732723236084, + "learning_rate": 6.980234260614933e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6911745071411133, + "num_tokens": 49218341.0, + "step": 1908 + }, + { + "epoch": 0.20964199428947947, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.6665616035461426, + "learning_rate": 6.98389458272328e-07, + "loss": 1.0242, + "mean_token_accuracy": 0.6969563961029053, + "num_tokens": 49239163.0, + "step": 1909 + }, + { + "epoch": 0.2097518119920931, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.2571475505828857, + "learning_rate": 6.987554904831625e-07, + "loss": 1.0475, + 
"mean_token_accuracy": 0.6909555196762085, + "num_tokens": 49268705.0, + "step": 1910 + }, + { + "epoch": 0.20986162969470679, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.3965847492218018, + "learning_rate": 6.99121522693997e-07, + "loss": 0.9746, + "mean_token_accuracy": 0.7087337970733643, + "num_tokens": 49293437.0, + "step": 1911 + }, + { + "epoch": 0.20997144739732046, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.545743465423584, + "learning_rate": 6.994875549048316e-07, + "loss": 1.1085, + "mean_token_accuracy": 0.6693698167800903, + "num_tokens": 49317640.0, + "step": 1912 + }, + { + "epoch": 0.2100812650999341, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.164421558380127, + "learning_rate": 6.998535871156661e-07, + "loss": 1.0495, + "mean_token_accuracy": 0.6909928321838379, + "num_tokens": 49348204.0, + "step": 1913 + }, + { + "epoch": 0.21019108280254778, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.3065009117126465, + "learning_rate": 7.002196193265007e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6990437507629395, + "num_tokens": 49376892.0, + "step": 1914 + }, + { + "epoch": 0.21030090050516143, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.253446578979492, + "learning_rate": 7.005856515373353e-07, + "loss": 1.0079, + "mean_token_accuracy": 0.6902735233306885, + "num_tokens": 49405788.0, + "step": 1915 + }, + { + "epoch": 0.2104107182077751, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.5097198486328125, + "learning_rate": 7.009516837481698e-07, + "loss": 1.0666, + "mean_token_accuracy": 0.6846866607666016, + "num_tokens": 49428328.0, + "step": 1916 + }, + { + "epoch": 0.21052053591038875, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.255143642425537, + "learning_rate": 7.013177159590043e-07, + "loss": 1.086, + "mean_token_accuracy": 0.6906102895736694, + "num_tokens": 49456756.0, + "step": 1917 + }, + { + "epoch": 0.21063035361300242, + "ewc_loss": 5.125999450683594e-06, + 
"grad_norm": 2.4780936241149902, + "learning_rate": 7.016837481698389e-07, + "loss": 1.0372, + "mean_token_accuracy": 0.6935805678367615, + "num_tokens": 49481855.0, + "step": 1918 + }, + { + "epoch": 0.21074017131561606, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.50897216796875, + "learning_rate": 7.020497803806734e-07, + "loss": 1.0678, + "mean_token_accuracy": 0.6857726573944092, + "num_tokens": 49507527.0, + "step": 1919 + }, + { + "epoch": 0.21084998901822974, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.2785141468048096, + "learning_rate": 7.024158125915081e-07, + "loss": 1.0755, + "mean_token_accuracy": 0.6768909692764282, + "num_tokens": 49536120.0, + "step": 1920 + }, + { + "epoch": 0.2109598067208434, + "ewc_loss": 5.0961971282958984e-06, + "grad_norm": 2.5705184936523438, + "learning_rate": 7.027818448023426e-07, + "loss": 1.0078, + "mean_token_accuracy": 0.6986452341079712, + "num_tokens": 49557633.0, + "step": 1921 + }, + { + "epoch": 0.21106962442345706, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.4186840057373047, + "learning_rate": 7.031478770131771e-07, + "loss": 1.0036, + "mean_token_accuracy": 0.7029879689216614, + "num_tokens": 49581184.0, + "step": 1922 + }, + { + "epoch": 0.21117944212607073, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.223510265350342, + "learning_rate": 7.035139092240117e-07, + "loss": 1.0674, + "mean_token_accuracy": 0.6857003569602966, + "num_tokens": 49612774.0, + "step": 1923 + }, + { + "epoch": 0.21128925982868438, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.769429922103882, + "learning_rate": 7.038799414348462e-07, + "loss": 1.0033, + "mean_token_accuracy": 0.6992085576057434, + "num_tokens": 49632537.0, + "step": 1924 + }, + { + "epoch": 0.21139907753129805, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.148005247116089, + "learning_rate": 7.042459736456808e-07, + "loss": 0.9567, + "mean_token_accuracy": 0.7209749221801758, + "num_tokens": 49661533.0, + "step": 
1925 + }, + { + "epoch": 0.2115088952339117, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.1562724113464355, + "learning_rate": 7.046120058565154e-07, + "loss": 1.0048, + "mean_token_accuracy": 0.6991261839866638, + "num_tokens": 49691168.0, + "step": 1926 + }, + { + "epoch": 0.21161871293652537, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.293722152709961, + "learning_rate": 7.049780380673499e-07, + "loss": 1.1391, + "mean_token_accuracy": 0.6707053184509277, + "num_tokens": 49718846.0, + "step": 1927 + }, + { + "epoch": 0.21172853063913902, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.873337984085083, + "learning_rate": 7.053440702781844e-07, + "loss": 1.0363, + "mean_token_accuracy": 0.6949337124824524, + "num_tokens": 49738425.0, + "step": 1928 + }, + { + "epoch": 0.2118383483417527, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.577627420425415, + "learning_rate": 7.05710102489019e-07, + "loss": 0.9828, + "mean_token_accuracy": 0.7075948715209961, + "num_tokens": 49762843.0, + "step": 1929 + }, + { + "epoch": 0.21194816604436637, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.2841360569000244, + "learning_rate": 7.060761346998536e-07, + "loss": 0.9302, + "mean_token_accuracy": 0.7247504591941833, + "num_tokens": 49787535.0, + "step": 1930 + }, + { + "epoch": 0.21205798374698, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.6493372917175293, + "learning_rate": 7.064421669106881e-07, + "loss": 0.9634, + "mean_token_accuracy": 0.7166110277175903, + "num_tokens": 49808954.0, + "step": 1931 + }, + { + "epoch": 0.21216780144959368, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.406294345855713, + "learning_rate": 7.068081991215227e-07, + "loss": 0.9736, + "mean_token_accuracy": 0.7137020230293274, + "num_tokens": 49833932.0, + "step": 1932 + }, + { + "epoch": 0.21227761915220733, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.327439546585083, + "learning_rate": 7.071742313323572e-07, + "loss": 1.0593, + 
"mean_token_accuracy": 0.6878560781478882, + "num_tokens": 49861910.0, + "step": 1933 + }, + { + "epoch": 0.212387436854821, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.5016651153564453, + "learning_rate": 7.075402635431917e-07, + "loss": 1.0641, + "mean_token_accuracy": 0.6869984865188599, + "num_tokens": 49885140.0, + "step": 1934 + }, + { + "epoch": 0.21249725455743465, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 2.3170859813690186, + "learning_rate": 7.079062957540263e-07, + "loss": 0.9589, + "mean_token_accuracy": 0.7129976749420166, + "num_tokens": 49909653.0, + "step": 1935 + }, + { + "epoch": 0.21260707226004832, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 8.54220199584961, + "learning_rate": 7.082723279648609e-07, + "loss": 1.0705, + "mean_token_accuracy": 0.6857879161834717, + "num_tokens": 49936584.0, + "step": 1936 + }, + { + "epoch": 0.21271688996266197, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.7138562202453613, + "learning_rate": 7.086383601756955e-07, + "loss": 1.034, + "mean_token_accuracy": 0.6881668567657471, + "num_tokens": 49956702.0, + "step": 1937 + }, + { + "epoch": 0.21282670766527564, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.61106276512146, + "learning_rate": 7.0900439238653e-07, + "loss": 0.9865, + "mean_token_accuracy": 0.6947301626205444, + "num_tokens": 49977896.0, + "step": 1938 + }, + { + "epoch": 0.2129365253678893, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.2845208644866943, + "learning_rate": 7.093704245973645e-07, + "loss": 0.9735, + "mean_token_accuracy": 0.7115854024887085, + "num_tokens": 50003208.0, + "step": 1939 + }, + { + "epoch": 0.21304634307050296, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.5647621154785156, + "learning_rate": 7.097364568081991e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.6913294792175293, + "num_tokens": 50024940.0, + "step": 1940 + }, + { + "epoch": 0.21315616077311664, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 
2.188577890396118, + "learning_rate": 7.101024890190337e-07, + "loss": 0.9279, + "mean_token_accuracy": 0.7208740711212158, + "num_tokens": 50051034.0, + "step": 1941 + }, + { + "epoch": 0.21326597847573028, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.3324475288391113, + "learning_rate": 7.104685212298682e-07, + "loss": 1.0597, + "mean_token_accuracy": 0.6858483552932739, + "num_tokens": 50077175.0, + "step": 1942 + }, + { + "epoch": 0.21337579617834396, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.030261993408203, + "learning_rate": 7.108345534407028e-07, + "loss": 1.1749, + "mean_token_accuracy": 0.6529050469398499, + "num_tokens": 50115357.0, + "step": 1943 + }, + { + "epoch": 0.2134856138809576, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.1990878582000732, + "learning_rate": 7.112005856515373e-07, + "loss": 0.9639, + "mean_token_accuracy": 0.7111510038375854, + "num_tokens": 50143631.0, + "step": 1944 + }, + { + "epoch": 0.21359543158357128, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.5119199752807617, + "learning_rate": 7.115666178623718e-07, + "loss": 0.9991, + "mean_token_accuracy": 0.7060058116912842, + "num_tokens": 50167265.0, + "step": 1945 + }, + { + "epoch": 0.21370524928618492, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.5684256553649902, + "learning_rate": 7.119326500732065e-07, + "loss": 1.13, + "mean_token_accuracy": 0.6677508354187012, + "num_tokens": 50191169.0, + "step": 1946 + }, + { + "epoch": 0.2138150669887986, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.250487804412842, + "learning_rate": 7.12298682284041e-07, + "loss": 1.0467, + "mean_token_accuracy": 0.688973605632782, + "num_tokens": 50218920.0, + "step": 1947 + }, + { + "epoch": 0.21392488469141224, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.498464584350586, + "learning_rate": 7.126647144948755e-07, + "loss": 1.1427, + "mean_token_accuracy": 0.6650177240371704, + "num_tokens": 50244790.0, + "step": 1948 + }, + { + 
"epoch": 0.21403470239402592, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.3153231143951416, + "learning_rate": 7.130307467057101e-07, + "loss": 1.0086, + "mean_token_accuracy": 0.7017388343811035, + "num_tokens": 50270316.0, + "step": 1949 + }, + { + "epoch": 0.2141445200966396, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.300830602645874, + "learning_rate": 7.133967789165446e-07, + "loss": 1.0215, + "mean_token_accuracy": 0.6975135207176208, + "num_tokens": 50296368.0, + "step": 1950 + }, + { + "epoch": 0.21425433779925324, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.6250417232513428, + "learning_rate": 7.137628111273791e-07, + "loss": 1.0028, + "mean_token_accuracy": 0.6946479678153992, + "num_tokens": 50317194.0, + "step": 1951 + }, + { + "epoch": 0.2143641555018669, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 2.508284330368042, + "learning_rate": 7.141288433382138e-07, + "loss": 0.9699, + "mean_token_accuracy": 0.7091149687767029, + "num_tokens": 50338475.0, + "step": 1952 + }, + { + "epoch": 0.21447397320448056, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.382925510406494, + "learning_rate": 7.144948755490483e-07, + "loss": 1.0819, + "mean_token_accuracy": 0.6765031218528748, + "num_tokens": 50363753.0, + "step": 1953 + }, + { + "epoch": 0.21458379090709423, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.4149951934814453, + "learning_rate": 7.148609077598829e-07, + "loss": 0.9808, + "mean_token_accuracy": 0.7033405900001526, + "num_tokens": 50387871.0, + "step": 1954 + }, + { + "epoch": 0.21469360860970788, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.5917880535125732, + "learning_rate": 7.152269399707174e-07, + "loss": 1.0246, + "mean_token_accuracy": 0.6970500946044922, + "num_tokens": 50408307.0, + "step": 1955 + }, + { + "epoch": 0.21480342631232155, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.367154836654663, + "learning_rate": 7.155929721815519e-07, + "loss": 1.0972, + 
"mean_token_accuracy": 0.6894532442092896, + "num_tokens": 50434392.0, + "step": 1956 + }, + { + "epoch": 0.2149132440149352, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.207031011581421, + "learning_rate": 7.159590043923866e-07, + "loss": 1.0496, + "mean_token_accuracy": 0.6865297555923462, + "num_tokens": 50464382.0, + "step": 1957 + }, + { + "epoch": 0.21502306171754887, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 2.727504014968872, + "learning_rate": 7.163250366032211e-07, + "loss": 1.1512, + "mean_token_accuracy": 0.6590665578842163, + "num_tokens": 50486233.0, + "step": 1958 + }, + { + "epoch": 0.21513287942016254, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.295473575592041, + "learning_rate": 7.166910688140556e-07, + "loss": 1.0947, + "mean_token_accuracy": 0.673313319683075, + "num_tokens": 50516822.0, + "step": 1959 + }, + { + "epoch": 0.2152426971227762, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.510101795196533, + "learning_rate": 7.170571010248902e-07, + "loss": 1.1225, + "mean_token_accuracy": 0.6736338138580322, + "num_tokens": 50541270.0, + "step": 1960 + }, + { + "epoch": 0.21535251482538986, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.333033561706543, + "learning_rate": 7.174231332357247e-07, + "loss": 0.9932, + "mean_token_accuracy": 0.699852466583252, + "num_tokens": 50566736.0, + "step": 1961 + }, + { + "epoch": 0.2154623325280035, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.3852689266204834, + "learning_rate": 7.177891654465593e-07, + "loss": 1.0321, + "mean_token_accuracy": 0.6914111375808716, + "num_tokens": 50591212.0, + "step": 1962 + }, + { + "epoch": 0.21557215023061718, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.5959043502807617, + "learning_rate": 7.181551976573939e-07, + "loss": 0.9562, + "mean_token_accuracy": 0.7115835547447205, + "num_tokens": 50613334.0, + "step": 1963 + }, + { + "epoch": 0.21568196793323083, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 
2.122154712677002, + "learning_rate": 7.185212298682284e-07, + "loss": 0.9627, + "mean_token_accuracy": 0.7105099558830261, + "num_tokens": 50642793.0, + "step": 1964 + }, + { + "epoch": 0.2157917856358445, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.3252644538879395, + "learning_rate": 7.188872620790629e-07, + "loss": 1.132, + "mean_token_accuracy": 0.6781980991363525, + "num_tokens": 50670184.0, + "step": 1965 + }, + { + "epoch": 0.21590160333845815, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.887828826904297, + "learning_rate": 7.192532942898975e-07, + "loss": 0.9556, + "mean_token_accuracy": 0.7064916491508484, + "num_tokens": 50688140.0, + "step": 1966 + }, + { + "epoch": 0.21601142104107182, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 2.419955253601074, + "learning_rate": 7.19619326500732e-07, + "loss": 1.0589, + "mean_token_accuracy": 0.683424174785614, + "num_tokens": 50712549.0, + "step": 1967 + }, + { + "epoch": 0.2161212387436855, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.4382693767547607, + "learning_rate": 7.199853587115667e-07, + "loss": 1.1443, + "mean_token_accuracy": 0.6638689041137695, + "num_tokens": 50738520.0, + "step": 1968 + }, + { + "epoch": 0.21623105644629914, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.348747730255127, + "learning_rate": 7.203513909224012e-07, + "loss": 0.9823, + "mean_token_accuracy": 0.7144097089767456, + "num_tokens": 50763757.0, + "step": 1969 + }, + { + "epoch": 0.21634087414891282, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.154876470565796, + "learning_rate": 7.207174231332357e-07, + "loss": 1.0409, + "mean_token_accuracy": 0.7023401856422424, + "num_tokens": 50793832.0, + "step": 1970 + }, + { + "epoch": 0.21645069185152646, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.6931259632110596, + "learning_rate": 7.210834553440703e-07, + "loss": 0.9531, + "mean_token_accuracy": 0.7098591327667236, + "num_tokens": 50814416.0, + "step": 1971 + }, + { + 
"epoch": 0.21656050955414013, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.8004841804504395, + "learning_rate": 7.214494875549048e-07, + "loss": 0.9584, + "mean_token_accuracy": 0.7056249976158142, + "num_tokens": 50833389.0, + "step": 1972 + }, + { + "epoch": 0.21667032725675378, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 2.2347795963287354, + "learning_rate": 7.218155197657394e-07, + "loss": 1.1479, + "mean_token_accuracy": 0.6697602868080139, + "num_tokens": 50863979.0, + "step": 1973 + }, + { + "epoch": 0.21678014495936745, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 2.4748663902282715, + "learning_rate": 7.22181551976574e-07, + "loss": 0.9724, + "mean_token_accuracy": 0.7038378119468689, + "num_tokens": 50886208.0, + "step": 1974 + }, + { + "epoch": 0.2168899626619811, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 2.5673203468322754, + "learning_rate": 7.225475841874085e-07, + "loss": 1.0466, + "mean_token_accuracy": 0.6866936087608337, + "num_tokens": 50908463.0, + "step": 1975 + }, + { + "epoch": 0.21699978036459477, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 2.5710906982421875, + "learning_rate": 7.22913616398243e-07, + "loss": 0.9973, + "mean_token_accuracy": 0.7095439434051514, + "num_tokens": 50931728.0, + "step": 1976 + }, + { + "epoch": 0.21710959806720842, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.294524669647217, + "learning_rate": 7.232796486090776e-07, + "loss": 1.0741, + "mean_token_accuracy": 0.680872917175293, + "num_tokens": 50956836.0, + "step": 1977 + }, + { + "epoch": 0.2172194157698221, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.2060813903808594, + "learning_rate": 7.236456808199122e-07, + "loss": 1.1543, + "mean_token_accuracy": 0.6597827076911926, + "num_tokens": 50988874.0, + "step": 1978 + }, + { + "epoch": 0.21732923347243577, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.3517422676086426, + "learning_rate": 7.240117130307467e-07, + "loss": 1.1248, + 
"mean_token_accuracy": 0.6825264692306519, + "num_tokens": 51017825.0, + "step": 1979 + }, + { + "epoch": 0.2174390511750494, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.2525429725646973, + "learning_rate": 7.243777452415813e-07, + "loss": 1.0068, + "mean_token_accuracy": 0.70005863904953, + "num_tokens": 51045955.0, + "step": 1980 + }, + { + "epoch": 0.2175488688776631, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.571098566055298, + "learning_rate": 7.247437774524158e-07, + "loss": 1.0533, + "mean_token_accuracy": 0.678805947303772, + "num_tokens": 51069330.0, + "step": 1981 + }, + { + "epoch": 0.21765868658027673, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.692788600921631, + "learning_rate": 7.251098096632503e-07, + "loss": 1.0838, + "mean_token_accuracy": 0.6921013593673706, + "num_tokens": 51092178.0, + "step": 1982 + }, + { + "epoch": 0.2177685042828904, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.4312024116516113, + "learning_rate": 7.254758418740849e-07, + "loss": 1.007, + "mean_token_accuracy": 0.6920769810676575, + "num_tokens": 51115554.0, + "step": 1983 + }, + { + "epoch": 0.21787832198550405, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 2.5262575149536133, + "learning_rate": 7.258418740849195e-07, + "loss": 0.9939, + "mean_token_accuracy": 0.7019551992416382, + "num_tokens": 51135508.0, + "step": 1984 + }, + { + "epoch": 0.21798813968811773, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 7.0492024421691895, + "learning_rate": 7.262079062957541e-07, + "loss": 0.9934, + "mean_token_accuracy": 0.7063193917274475, + "num_tokens": 51159938.0, + "step": 1985 + }, + { + "epoch": 0.21809795739073137, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.252171754837036, + "learning_rate": 7.265739385065886e-07, + "loss": 1.1385, + "mean_token_accuracy": 0.6670327186584473, + "num_tokens": 51189162.0, + "step": 1986 + }, + { + "epoch": 0.21820777509334505, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 
2.5269775390625, + "learning_rate": 7.269399707174231e-07, + "loss": 1.0292, + "mean_token_accuracy": 0.6919703483581543, + "num_tokens": 51213141.0, + "step": 1987 + }, + { + "epoch": 0.21831759279595872, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.2669050693511963, + "learning_rate": 7.273060029282577e-07, + "loss": 1.0516, + "mean_token_accuracy": 0.6874059438705444, + "num_tokens": 51243740.0, + "step": 1988 + }, + { + "epoch": 0.21842741049857237, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.4467275142669678, + "learning_rate": 7.276720351390923e-07, + "loss": 1.0435, + "mean_token_accuracy": 0.6985205411911011, + "num_tokens": 51267656.0, + "step": 1989 + }, + { + "epoch": 0.21853722820118604, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.452791929244995, + "learning_rate": 7.280380673499268e-07, + "loss": 0.9585, + "mean_token_accuracy": 0.7130382061004639, + "num_tokens": 51291884.0, + "step": 1990 + }, + { + "epoch": 0.21864704590379969, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.58526873588562, + "learning_rate": 7.284040995607614e-07, + "loss": 1.0623, + "mean_token_accuracy": 0.6848443746566772, + "num_tokens": 51313957.0, + "step": 1991 + }, + { + "epoch": 0.21875686360641336, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.8463199138641357, + "learning_rate": 7.287701317715959e-07, + "loss": 1.0009, + "mean_token_accuracy": 0.7001370787620544, + "num_tokens": 51333146.0, + "step": 1992 + }, + { + "epoch": 0.218866681309027, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.745173454284668, + "learning_rate": 7.291361639824304e-07, + "loss": 1.0867, + "mean_token_accuracy": 0.679828405380249, + "num_tokens": 51354676.0, + "step": 1993 + }, + { + "epoch": 0.21897649901164068, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.470677137374878, + "learning_rate": 7.295021961932651e-07, + "loss": 1.0133, + "mean_token_accuracy": 0.6995083093643188, + "num_tokens": 51377314.0, + "step": 1994 + }, + { + 
"epoch": 0.21908631671425433, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 2.458430767059326, + "learning_rate": 7.298682284040996e-07, + "loss": 1.0679, + "mean_token_accuracy": 0.6853029727935791, + "num_tokens": 51404483.0, + "step": 1995 + }, + { + "epoch": 0.219196134416868, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 2.4756784439086914, + "learning_rate": 7.30234260614934e-07, + "loss": 1.0182, + "mean_token_accuracy": 0.69611656665802, + "num_tokens": 51427270.0, + "step": 1996 + }, + { + "epoch": 0.21930595211948167, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 2.572380304336548, + "learning_rate": 7.306002928257687e-07, + "loss": 1.0831, + "mean_token_accuracy": 0.6860998272895813, + "num_tokens": 51451033.0, + "step": 1997 + }, + { + "epoch": 0.21941576982209532, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 2.345184087753296, + "learning_rate": 7.309663250366031e-07, + "loss": 1.0059, + "mean_token_accuracy": 0.7023012042045593, + "num_tokens": 51477625.0, + "step": 1998 + }, + { + "epoch": 0.219525587524709, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 2.6952555179595947, + "learning_rate": 7.313323572474376e-07, + "loss": 1.057, + "mean_token_accuracy": 0.6884139180183411, + "num_tokens": 51499313.0, + "step": 1999 + }, + { + "epoch": 0.21963540522732264, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 32.10643768310547, + "learning_rate": 7.316983894582724e-07, + "loss": 1.0226, + "mean_token_accuracy": 0.7053412795066833, + "num_tokens": 51523584.0, + "step": 2000 + }, + { + "epoch": 0.2197452229299363, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.7848541736602783, + "learning_rate": 7.320644216691068e-07, + "loss": 1.0827, + "mean_token_accuracy": 0.6818180084228516, + "num_tokens": 51544804.0, + "step": 2001 + }, + { + "epoch": 0.21985504063254996, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.559675931930542, + "learning_rate": 7.324304538799415e-07, + "loss": 0.9895, + 
"mean_token_accuracy": 0.7010430097579956, + "num_tokens": 51568871.0, + "step": 2002 + }, + { + "epoch": 0.21996485833516363, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.379936933517456, + "learning_rate": 7.32796486090776e-07, + "loss": 1.032, + "mean_token_accuracy": 0.7044084072113037, + "num_tokens": 51596345.0, + "step": 2003 + }, + { + "epoch": 0.22007467603777728, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.2084827423095703, + "learning_rate": 7.331625183016104e-07, + "loss": 1.0758, + "mean_token_accuracy": 0.6853718757629395, + "num_tokens": 51624644.0, + "step": 2004 + }, + { + "epoch": 0.22018449374039095, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.419459104537964, + "learning_rate": 7.335285505124452e-07, + "loss": 0.9443, + "mean_token_accuracy": 0.7151201367378235, + "num_tokens": 51647338.0, + "step": 2005 + }, + { + "epoch": 0.22029431144300463, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.560816526412964, + "learning_rate": 7.338945827232796e-07, + "loss": 1.0541, + "mean_token_accuracy": 0.6895610690116882, + "num_tokens": 51670588.0, + "step": 2006 + }, + { + "epoch": 0.22040412914561827, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.372199535369873, + "learning_rate": 7.342606149341141e-07, + "loss": 1.014, + "mean_token_accuracy": 0.7083899974822998, + "num_tokens": 51694915.0, + "step": 2007 + }, + { + "epoch": 0.22051394684823195, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.3215930461883545, + "learning_rate": 7.346266471449487e-07, + "loss": 1.1161, + "mean_token_accuracy": 0.66986083984375, + "num_tokens": 51722461.0, + "step": 2008 + }, + { + "epoch": 0.2206237645508456, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 2.381115674972534, + "learning_rate": 7.349926793557832e-07, + "loss": 1.0064, + "mean_token_accuracy": 0.6977695822715759, + "num_tokens": 51750114.0, + "step": 2009 + }, + { + "epoch": 0.22073358225345927, + "ewc_loss": 5.453824996948242e-06, + 
"grad_norm": 2.572354793548584, + "learning_rate": 7.353587115666178e-07, + "loss": 0.9871, + "mean_token_accuracy": 0.7046359181404114, + "num_tokens": 51769519.0, + "step": 2010 + }, + { + "epoch": 0.2208433999560729, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 2.635201930999756, + "learning_rate": 7.357247437774524e-07, + "loss": 0.9705, + "mean_token_accuracy": 0.7137607336044312, + "num_tokens": 51790952.0, + "step": 2011 + }, + { + "epoch": 0.22095321765868658, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 2.193988561630249, + "learning_rate": 7.360907759882869e-07, + "loss": 1.0286, + "mean_token_accuracy": 0.6996466517448425, + "num_tokens": 51820766.0, + "step": 2012 + }, + { + "epoch": 0.22106303536130023, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 2.290621757507324, + "learning_rate": 7.364568081991214e-07, + "loss": 1.0933, + "mean_token_accuracy": 0.6812613010406494, + "num_tokens": 51848779.0, + "step": 2013 + }, + { + "epoch": 0.2211728530639139, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.5802547931671143, + "learning_rate": 7.36822840409956e-07, + "loss": 1.066, + "mean_token_accuracy": 0.687449038028717, + "num_tokens": 51872683.0, + "step": 2014 + }, + { + "epoch": 0.22128267076652755, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 2.6487338542938232, + "learning_rate": 7.371888726207905e-07, + "loss": 0.9965, + "mean_token_accuracy": 0.7027897238731384, + "num_tokens": 51892998.0, + "step": 2015 + }, + { + "epoch": 0.22139248846914122, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.5006930828094482, + "learning_rate": 7.375549048316251e-07, + "loss": 1.0112, + "mean_token_accuracy": 0.6967720985412598, + "num_tokens": 51916800.0, + "step": 2016 + }, + { + "epoch": 0.2215023061717549, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.633699655532837, + "learning_rate": 7.379209370424597e-07, + "loss": 0.9865, + "mean_token_accuracy": 0.70676189661026, + "num_tokens": 51938004.0, + "step": 2017 + 
}, + { + "epoch": 0.22161212387436854, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.575561046600342, + "learning_rate": 7.382869692532942e-07, + "loss": 1.0028, + "mean_token_accuracy": 0.6972218751907349, + "num_tokens": 51957847.0, + "step": 2018 + }, + { + "epoch": 0.22172194157698222, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.5401880741119385, + "learning_rate": 7.386530014641288e-07, + "loss": 0.9919, + "mean_token_accuracy": 0.7036526203155518, + "num_tokens": 51980724.0, + "step": 2019 + }, + { + "epoch": 0.22183175927959586, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.3731658458709717, + "learning_rate": 7.390190336749633e-07, + "loss": 0.9694, + "mean_token_accuracy": 0.7040243744850159, + "num_tokens": 52004838.0, + "step": 2020 + }, + { + "epoch": 0.22194157698220954, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.2978434562683105, + "learning_rate": 7.393850658857979e-07, + "loss": 1.1054, + "mean_token_accuracy": 0.6861253976821899, + "num_tokens": 52031951.0, + "step": 2021 + }, + { + "epoch": 0.22205139468482318, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.6335947513580322, + "learning_rate": 7.397510980966325e-07, + "loss": 1.0534, + "mean_token_accuracy": 0.6833438873291016, + "num_tokens": 52055321.0, + "step": 2022 + }, + { + "epoch": 0.22216121238743686, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.438988208770752, + "learning_rate": 7.40117130307467e-07, + "loss": 1.1047, + "mean_token_accuracy": 0.6735108494758606, + "num_tokens": 52081626.0, + "step": 2023 + }, + { + "epoch": 0.2222710300900505, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.9956459999084473, + "learning_rate": 7.404831625183015e-07, + "loss": 1.0747, + "mean_token_accuracy": 0.6844187378883362, + "num_tokens": 52116812.0, + "step": 2024 + }, + { + "epoch": 0.22238084779266418, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.336843967437744, + "learning_rate": 7.408491947291361e-07, + "loss": 1.0774, 
+ "mean_token_accuracy": 0.6922982335090637, + "num_tokens": 52146142.0, + "step": 2025 + }, + { + "epoch": 0.22249066549527785, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.394470691680908, + "learning_rate": 7.412152269399707e-07, + "loss": 1.0639, + "mean_token_accuracy": 0.6859540343284607, + "num_tokens": 52171070.0, + "step": 2026 + }, + { + "epoch": 0.2226004831978915, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 2.2999155521392822, + "learning_rate": 7.415812591508052e-07, + "loss": 0.9571, + "mean_token_accuracy": 0.7109149098396301, + "num_tokens": 52198273.0, + "step": 2027 + }, + { + "epoch": 0.22271030090050517, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.5725438594818115, + "learning_rate": 7.419472913616398e-07, + "loss": 1.0322, + "mean_token_accuracy": 0.6942853927612305, + "num_tokens": 52220792.0, + "step": 2028 + }, + { + "epoch": 0.22282011860311882, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.284914970397949, + "learning_rate": 7.423133235724743e-07, + "loss": 1.0336, + "mean_token_accuracy": 0.7063943147659302, + "num_tokens": 52248973.0, + "step": 2029 + }, + { + "epoch": 0.2229299363057325, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.504697322845459, + "learning_rate": 7.426793557833088e-07, + "loss": 1.1046, + "mean_token_accuracy": 0.6763947010040283, + "num_tokens": 52271488.0, + "step": 2030 + }, + { + "epoch": 0.22303975400834614, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.277928113937378, + "learning_rate": 7.430453879941434e-07, + "loss": 1.1447, + "mean_token_accuracy": 0.6641497611999512, + "num_tokens": 52299754.0, + "step": 2031 + }, + { + "epoch": 0.2231495717109598, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 2.4005343914031982, + "learning_rate": 7.43411420204978e-07, + "loss": 1.1057, + "mean_token_accuracy": 0.6784276962280273, + "num_tokens": 52324906.0, + "step": 2032 + }, + { + "epoch": 0.22325938941357346, + "ewc_loss": 5.543231964111328e-06, + 
"grad_norm": 2.395853042602539, + "learning_rate": 7.437774524158125e-07, + "loss": 0.9945, + "mean_token_accuracy": 0.7012863755226135, + "num_tokens": 52348354.0, + "step": 2033 + }, + { + "epoch": 0.22336920711618713, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.691627264022827, + "learning_rate": 7.441434846266471e-07, + "loss": 1.0433, + "mean_token_accuracy": 0.6887279748916626, + "num_tokens": 52369396.0, + "step": 2034 + }, + { + "epoch": 0.2234790248188008, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.5527822971343994, + "learning_rate": 7.445095168374816e-07, + "loss": 0.8924, + "mean_token_accuracy": 0.7203985452651978, + "num_tokens": 52391080.0, + "step": 2035 + }, + { + "epoch": 0.22358884252141445, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.7086639404296875, + "learning_rate": 7.448755490483162e-07, + "loss": 0.9763, + "mean_token_accuracy": 0.7077159285545349, + "num_tokens": 52410968.0, + "step": 2036 + }, + { + "epoch": 0.22369866022402812, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 2.158508777618408, + "learning_rate": 7.452415812591508e-07, + "loss": 1.1129, + "mean_token_accuracy": 0.6693521738052368, + "num_tokens": 52441502.0, + "step": 2037 + }, + { + "epoch": 0.22380847792664177, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.5832273960113525, + "learning_rate": 7.456076134699853e-07, + "loss": 1.053, + "mean_token_accuracy": 0.6866941452026367, + "num_tokens": 52464356.0, + "step": 2038 + }, + { + "epoch": 0.22391829562925544, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.3336849212646484, + "learning_rate": 7.459736456808199e-07, + "loss": 1.042, + "mean_token_accuracy": 0.6915487051010132, + "num_tokens": 52491196.0, + "step": 2039 + }, + { + "epoch": 0.2240281133318691, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.9168848991394043, + "learning_rate": 7.463396778916544e-07, + "loss": 1.0397, + "mean_token_accuracy": 0.691612958908081, + "num_tokens": 52510079.0, + "step": 
2040 + }, + { + "epoch": 0.22413793103448276, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.393538475036621, + "learning_rate": 7.467057101024889e-07, + "loss": 1.1209, + "mean_token_accuracy": 0.6773730516433716, + "num_tokens": 52536899.0, + "step": 2041 + }, + { + "epoch": 0.2242477487370964, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.370312213897705, + "learning_rate": 7.470717423133236e-07, + "loss": 1.1273, + "mean_token_accuracy": 0.6733257174491882, + "num_tokens": 52565125.0, + "step": 2042 + }, + { + "epoch": 0.22435756643971008, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.3430051803588867, + "learning_rate": 7.474377745241581e-07, + "loss": 1.0103, + "mean_token_accuracy": 0.7004585266113281, + "num_tokens": 52589724.0, + "step": 2043 + }, + { + "epoch": 0.22446738414232376, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.21191668510437, + "learning_rate": 7.478038067349926e-07, + "loss": 1.1423, + "mean_token_accuracy": 0.6663801670074463, + "num_tokens": 52620905.0, + "step": 2044 + }, + { + "epoch": 0.2245772018449374, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.5538554191589355, + "learning_rate": 7.481698389458272e-07, + "loss": 1.0042, + "mean_token_accuracy": 0.7046924829483032, + "num_tokens": 52647382.0, + "step": 2045 + }, + { + "epoch": 0.22468701954755108, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 2.5839059352874756, + "learning_rate": 7.485358711566617e-07, + "loss": 1.0471, + "mean_token_accuracy": 0.6864657402038574, + "num_tokens": 52670035.0, + "step": 2046 + }, + { + "epoch": 0.22479683725016472, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.0897560119628906, + "learning_rate": 7.489019033674962e-07, + "loss": 1.0231, + "mean_token_accuracy": 0.6927273273468018, + "num_tokens": 52702091.0, + "step": 2047 + }, + { + "epoch": 0.2249066549527784, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.5025415420532227, + "learning_rate": 7.492679355783309e-07, + "loss": 
0.993, + "mean_token_accuracy": 0.7005960941314697, + "num_tokens": 52723476.0, + "step": 2048 + }, + { + "epoch": 0.22501647265539204, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.265443801879883, + "learning_rate": 7.496339677891654e-07, + "loss": 0.9436, + "mean_token_accuracy": 0.7139152884483337, + "num_tokens": 52750250.0, + "step": 2049 + }, + { + "epoch": 0.22512629035800572, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.415310859680176, + "learning_rate": 7.5e-07, + "loss": 1.0226, + "mean_token_accuracy": 0.6942548751831055, + "num_tokens": 52774913.0, + "step": 2050 + }, + { + "epoch": 0.22523610806061936, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.4547674655914307, + "learning_rate": 7.503660322108345e-07, + "loss": 1.0902, + "mean_token_accuracy": 0.6807018518447876, + "num_tokens": 52799061.0, + "step": 2051 + }, + { + "epoch": 0.22534592576323303, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.2333085536956787, + "learning_rate": 7.50732064421669e-07, + "loss": 1.0466, + "mean_token_accuracy": 0.6877169013023376, + "num_tokens": 52827682.0, + "step": 2052 + }, + { + "epoch": 0.22545574346584668, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.5295751094818115, + "learning_rate": 7.510980966325037e-07, + "loss": 1.0936, + "mean_token_accuracy": 0.6757575869560242, + "num_tokens": 52854004.0, + "step": 2053 + }, + { + "epoch": 0.22556556116846035, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.6960906982421875, + "learning_rate": 7.514641288433382e-07, + "loss": 1.0636, + "mean_token_accuracy": 0.6939013004302979, + "num_tokens": 52875334.0, + "step": 2054 + }, + { + "epoch": 0.22567537887107403, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.364152193069458, + "learning_rate": 7.518301610541727e-07, + "loss": 1.0531, + "mean_token_accuracy": 0.6934512853622437, + "num_tokens": 52901072.0, + "step": 2055 + }, + { + "epoch": 0.22578519657368767, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 
2.27803897857666, + "learning_rate": 7.521961932650073e-07, + "loss": 1.0537, + "mean_token_accuracy": 0.6921825408935547, + "num_tokens": 52929140.0, + "step": 2056 + }, + { + "epoch": 0.22589501427630135, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.4073004722595215, + "learning_rate": 7.525622254758418e-07, + "loss": 0.9924, + "mean_token_accuracy": 0.7021888494491577, + "num_tokens": 52953279.0, + "step": 2057 + }, + { + "epoch": 0.226004831978915, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.1199865341186523, + "learning_rate": 7.529282576866763e-07, + "loss": 1.0866, + "mean_token_accuracy": 0.6754007339477539, + "num_tokens": 52985489.0, + "step": 2058 + }, + { + "epoch": 0.22611464968152867, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.5799379348754883, + "learning_rate": 7.53294289897511e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.6961948871612549, + "num_tokens": 53008432.0, + "step": 2059 + }, + { + "epoch": 0.2262244673841423, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.62681245803833, + "learning_rate": 7.536603221083455e-07, + "loss": 1.0198, + "mean_token_accuracy": 0.7055109143257141, + "num_tokens": 53030196.0, + "step": 2060 + }, + { + "epoch": 0.226334285086756, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.2569937705993652, + "learning_rate": 7.5402635431918e-07, + "loss": 1.0179, + "mean_token_accuracy": 0.7036637663841248, + "num_tokens": 53059442.0, + "step": 2061 + }, + { + "epoch": 0.22644410278936963, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.6382930278778076, + "learning_rate": 7.543923865300146e-07, + "loss": 1.0042, + "mean_token_accuracy": 0.7015098333358765, + "num_tokens": 53081527.0, + "step": 2062 + }, + { + "epoch": 0.2265539204919833, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.239257335662842, + "learning_rate": 7.547584187408491e-07, + "loss": 1.0586, + "mean_token_accuracy": 0.6899538636207581, + "num_tokens": 53109281.0, + "step": 2063 + }, + { + 
"epoch": 0.22666373819459698, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.334300994873047, + "learning_rate": 7.551244509516837e-07, + "loss": 1.015, + "mean_token_accuracy": 0.7046499848365784, + "num_tokens": 53134575.0, + "step": 2064 + }, + { + "epoch": 0.22677355589721063, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.294090986251831, + "learning_rate": 7.554904831625183e-07, + "loss": 1.0161, + "mean_token_accuracy": 0.6978757381439209, + "num_tokens": 53163685.0, + "step": 2065 + }, + { + "epoch": 0.2268833735998243, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.1972038745880127, + "learning_rate": 7.558565153733528e-07, + "loss": 1.0477, + "mean_token_accuracy": 0.6928793787956238, + "num_tokens": 53193254.0, + "step": 2066 + }, + { + "epoch": 0.22699319130243795, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.5241382122039795, + "learning_rate": 7.562225475841874e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6914492845535278, + "num_tokens": 53216268.0, + "step": 2067 + }, + { + "epoch": 0.22710300900505162, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.423460006713867, + "learning_rate": 7.565885797950219e-07, + "loss": 0.9417, + "mean_token_accuracy": 0.7135815620422363, + "num_tokens": 53239331.0, + "step": 2068 + }, + { + "epoch": 0.22721282670766527, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.2045462131500244, + "learning_rate": 7.569546120058565e-07, + "loss": 1.076, + "mean_token_accuracy": 0.6900632381439209, + "num_tokens": 53269366.0, + "step": 2069 + }, + { + "epoch": 0.22732264441027894, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.5144872665405273, + "learning_rate": 7.573206442166911e-07, + "loss": 1.0343, + "mean_token_accuracy": 0.6935665607452393, + "num_tokens": 53293551.0, + "step": 2070 + }, + { + "epoch": 0.22743246211289259, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.480114698410034, + "learning_rate": 7.576866764275256e-07, + "loss": 0.9825, + 
"mean_token_accuracy": 0.6997548937797546, + "num_tokens": 53317407.0, + "step": 2071 + }, + { + "epoch": 0.22754227981550626, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.28564453125, + "learning_rate": 7.580527086383601e-07, + "loss": 1.119, + "mean_token_accuracy": 0.6813031435012817, + "num_tokens": 53349129.0, + "step": 2072 + }, + { + "epoch": 0.22765209751811993, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.5569188594818115, + "learning_rate": 7.584187408491947e-07, + "loss": 1.0086, + "mean_token_accuracy": 0.6968008279800415, + "num_tokens": 53373807.0, + "step": 2073 + }, + { + "epoch": 0.22776191522073358, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 2.4786553382873535, + "learning_rate": 7.587847730600292e-07, + "loss": 1.1176, + "mean_token_accuracy": 0.6722843647003174, + "num_tokens": 53398993.0, + "step": 2074 + }, + { + "epoch": 0.22787173292334725, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.1782755851745605, + "learning_rate": 7.591508052708638e-07, + "loss": 1.1288, + "mean_token_accuracy": 0.6722369194030762, + "num_tokens": 53430064.0, + "step": 2075 + }, + { + "epoch": 0.2279815506259609, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.226851463317871, + "learning_rate": 7.595168374816984e-07, + "loss": 1.0038, + "mean_token_accuracy": 0.6943734884262085, + "num_tokens": 53459080.0, + "step": 2076 + }, + { + "epoch": 0.22809136832857457, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.328237771987915, + "learning_rate": 7.598828696925329e-07, + "loss": 1.1238, + "mean_token_accuracy": 0.6729276776313782, + "num_tokens": 53488249.0, + "step": 2077 + }, + { + "epoch": 0.22820118603118822, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.349774122238159, + "learning_rate": 7.602489019033674e-07, + "loss": 0.9071, + "mean_token_accuracy": 0.7243132591247559, + "num_tokens": 53512224.0, + "step": 2078 + }, + { + "epoch": 0.2283110037338019, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 
2.3703598976135254, + "learning_rate": 7.60614934114202e-07, + "loss": 1.0596, + "mean_token_accuracy": 0.6833564043045044, + "num_tokens": 53539216.0, + "step": 2079 + }, + { + "epoch": 0.22842082143641554, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.7615156173706055, + "learning_rate": 7.609809663250366e-07, + "loss": 0.9719, + "mean_token_accuracy": 0.7064208984375, + "num_tokens": 53558834.0, + "step": 2080 + }, + { + "epoch": 0.2285306391390292, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.171339750289917, + "learning_rate": 7.613469985358711e-07, + "loss": 1.0427, + "mean_token_accuracy": 0.689433217048645, + "num_tokens": 53587249.0, + "step": 2081 + }, + { + "epoch": 0.2286404568416429, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.5824267864227295, + "learning_rate": 7.617130307467057e-07, + "loss": 1.003, + "mean_token_accuracy": 0.7009072303771973, + "num_tokens": 53610048.0, + "step": 2082 + }, + { + "epoch": 0.22875027454425653, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.239223003387451, + "learning_rate": 7.620790629575402e-07, + "loss": 1.025, + "mean_token_accuracy": 0.6975679397583008, + "num_tokens": 53639322.0, + "step": 2083 + }, + { + "epoch": 0.2288600922468702, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.62349009513855, + "learning_rate": 7.624450951683748e-07, + "loss": 0.9745, + "mean_token_accuracy": 0.7087594866752625, + "num_tokens": 53661912.0, + "step": 2084 + }, + { + "epoch": 0.22896990994948385, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.315213203430176, + "learning_rate": 7.628111273792094e-07, + "loss": 1.0157, + "mean_token_accuracy": 0.7023458480834961, + "num_tokens": 53692100.0, + "step": 2085 + }, + { + "epoch": 0.22907972765209753, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 2.2191615104675293, + "learning_rate": 7.631771595900439e-07, + "loss": 1.0773, + "mean_token_accuracy": 0.6823104023933411, + "num_tokens": 53721317.0, + "step": 2086 + }, + { + "epoch": 
0.22918954535471117, + "ewc_loss": 5.692243576049805e-06, + "grad_norm": 2.523301362991333, + "learning_rate": 7.635431918008785e-07, + "loss": 1.0848, + "mean_token_accuracy": 0.6889328956604004, + "num_tokens": 53745152.0, + "step": 2087 + }, + { + "epoch": 0.22929936305732485, + "ewc_loss": 5.692243576049805e-06, + "grad_norm": 2.7519028186798096, + "learning_rate": 7.63909224011713e-07, + "loss": 0.9861, + "mean_token_accuracy": 0.7100411653518677, + "num_tokens": 53766720.0, + "step": 2088 + }, + { + "epoch": 0.2294091807599385, + "ewc_loss": 5.692243576049805e-06, + "grad_norm": 2.2416226863861084, + "learning_rate": 7.642752562225475e-07, + "loss": 1.0068, + "mean_token_accuracy": 0.6950401067733765, + "num_tokens": 53794504.0, + "step": 2089 + }, + { + "epoch": 0.22951899846255217, + "ewc_loss": 5.692243576049805e-06, + "grad_norm": 2.4071052074432373, + "learning_rate": 7.646412884333821e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6954939961433411, + "num_tokens": 53820018.0, + "step": 2090 + }, + { + "epoch": 0.2296288161651658, + "ewc_loss": 5.692243576049805e-06, + "grad_norm": 2.118565082550049, + "learning_rate": 7.650073206442167e-07, + "loss": 1.0669, + "mean_token_accuracy": 0.687107264995575, + "num_tokens": 53851435.0, + "step": 2091 + }, + { + "epoch": 0.22973863386777948, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.595736265182495, + "learning_rate": 7.653733528550512e-07, + "loss": 0.9966, + "mean_token_accuracy": 0.701008677482605, + "num_tokens": 53873241.0, + "step": 2092 + }, + { + "epoch": 0.22984845157039316, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 2.1372592449188232, + "learning_rate": 7.657393850658858e-07, + "loss": 1.0688, + "mean_token_accuracy": 0.6876201629638672, + "num_tokens": 53903030.0, + "step": 2093 + }, + { + "epoch": 0.2299582692730068, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.683487892150879, + "learning_rate": 7.661054172767203e-07, + "loss": 1.0354, + "mean_token_accuracy": 
0.694661021232605, + "num_tokens": 53925218.0, + "step": 2094 + }, + { + "epoch": 0.23006808697562048, + "ewc_loss": 5.692243576049805e-06, + "grad_norm": 2.5258169174194336, + "learning_rate": 7.664714494875548e-07, + "loss": 1.0926, + "mean_token_accuracy": 0.682753324508667, + "num_tokens": 53948843.0, + "step": 2095 + }, + { + "epoch": 0.23017790467823412, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1877567768096924, + "learning_rate": 7.668374816983895e-07, + "loss": 0.9901, + "mean_token_accuracy": 0.7020261883735657, + "num_tokens": 53977112.0, + "step": 2096 + }, + { + "epoch": 0.2302877223808478, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.4338269233703613, + "learning_rate": 7.67203513909224e-07, + "loss": 1.0207, + "mean_token_accuracy": 0.7022558450698853, + "num_tokens": 54002810.0, + "step": 2097 + }, + { + "epoch": 0.23039754008346144, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.361656665802002, + "learning_rate": 7.675695461200585e-07, + "loss": 1.0175, + "mean_token_accuracy": 0.7050776481628418, + "num_tokens": 54029780.0, + "step": 2098 + }, + { + "epoch": 0.23050735778607512, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.3997843265533447, + "learning_rate": 7.679355783308931e-07, + "loss": 1.0925, + "mean_token_accuracy": 0.6848894953727722, + "num_tokens": 54054899.0, + "step": 2099 + }, + { + "epoch": 0.23061717548868876, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1949472427368164, + "learning_rate": 7.683016105417276e-07, + "loss": 1.1153, + "mean_token_accuracy": 0.6816962361335754, + "num_tokens": 54084137.0, + "step": 2100 + }, + { + "epoch": 0.23072699319130244, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.4281699657440186, + "learning_rate": 7.686676427525623e-07, + "loss": 0.9749, + "mean_token_accuracy": 0.7151771783828735, + "num_tokens": 54108823.0, + "step": 2101 + }, + { + "epoch": 0.2308368108939161, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.5488576889038086, + 
"learning_rate": 7.690336749633968e-07, + "loss": 0.9958, + "mean_token_accuracy": 0.7014980316162109, + "num_tokens": 54131262.0, + "step": 2102 + }, + { + "epoch": 0.23094662859652976, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.523972272872925, + "learning_rate": 7.693997071742313e-07, + "loss": 1.0283, + "mean_token_accuracy": 0.6917296051979065, + "num_tokens": 54154640.0, + "step": 2103 + }, + { + "epoch": 0.23105644629914343, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.1984622478485107, + "learning_rate": 7.697657393850659e-07, + "loss": 1.0091, + "mean_token_accuracy": 0.6995307207107544, + "num_tokens": 54185330.0, + "step": 2104 + }, + { + "epoch": 0.23116626400175708, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.648012638092041, + "learning_rate": 7.701317715959004e-07, + "loss": 0.9792, + "mean_token_accuracy": 0.7077256441116333, + "num_tokens": 54206266.0, + "step": 2105 + }, + { + "epoch": 0.23127608170437075, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.259422540664673, + "learning_rate": 7.704978038067349e-07, + "loss": 1.09, + "mean_token_accuracy": 0.6748136281967163, + "num_tokens": 54234830.0, + "step": 2106 + }, + { + "epoch": 0.2313858994069844, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.4108734130859375, + "learning_rate": 7.708638360175696e-07, + "loss": 0.9025, + "mean_token_accuracy": 0.7229263782501221, + "num_tokens": 54258518.0, + "step": 2107 + }, + { + "epoch": 0.23149571710959807, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.273742914199829, + "learning_rate": 7.712298682284041e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.6794631481170654, + "num_tokens": 54285139.0, + "step": 2108 + }, + { + "epoch": 0.23160553481221172, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.5403244495391846, + "learning_rate": 7.715959004392386e-07, + "loss": 0.9988, + "mean_token_accuracy": 0.7018237113952637, + "num_tokens": 54308341.0, + "step": 2109 + }, + { + "epoch": 
0.2317153525148254, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.330481767654419, + "learning_rate": 7.719619326500732e-07, + "loss": 1.1491, + "mean_token_accuracy": 0.6636692881584167, + "num_tokens": 54338009.0, + "step": 2110 + }, + { + "epoch": 0.23182517021743906, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.5825979709625244, + "learning_rate": 7.723279648609077e-07, + "loss": 1.0983, + "mean_token_accuracy": 0.6787366271018982, + "num_tokens": 54361156.0, + "step": 2111 + }, + { + "epoch": 0.2319349879200527, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.0690646171569824, + "learning_rate": 7.726939970717423e-07, + "loss": 0.9076, + "mean_token_accuracy": 0.7264351844787598, + "num_tokens": 54390645.0, + "step": 2112 + }, + { + "epoch": 0.23204480562266638, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.541682481765747, + "learning_rate": 7.730600292825769e-07, + "loss": 1.0088, + "mean_token_accuracy": 0.6939215660095215, + "num_tokens": 54412841.0, + "step": 2113 + }, + { + "epoch": 0.23215462332528003, + "ewc_loss": 5.751848220825195e-06, + "grad_norm": 2.2592790126800537, + "learning_rate": 7.734260614934114e-07, + "loss": 0.9849, + "mean_token_accuracy": 0.7040327787399292, + "num_tokens": 54440731.0, + "step": 2114 + }, + { + "epoch": 0.2322644410278937, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.7645506858825684, + "learning_rate": 7.737920937042459e-07, + "loss": 1.0266, + "mean_token_accuracy": 0.693942666053772, + "num_tokens": 54460803.0, + "step": 2115 + }, + { + "epoch": 0.23237425873050735, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.7092652320861816, + "learning_rate": 7.741581259150805e-07, + "loss": 1.0012, + "mean_token_accuracy": 0.7062537670135498, + "num_tokens": 54483265.0, + "step": 2116 + }, + { + "epoch": 0.23248407643312102, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.3117051124572754, + "learning_rate": 7.745241581259151e-07, + "loss": 1.001, + "mean_token_accuracy": 
0.7011712789535522, + "num_tokens": 54509995.0, + "step": 2117 + }, + { + "epoch": 0.23259389413573467, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.5531253814697266, + "learning_rate": 7.748901903367497e-07, + "loss": 0.9982, + "mean_token_accuracy": 0.6990505456924438, + "num_tokens": 54533911.0, + "step": 2118 + }, + { + "epoch": 0.23270371183834834, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.9852423667907715, + "learning_rate": 7.752562225475842e-07, + "loss": 1.0054, + "mean_token_accuracy": 0.6983813643455505, + "num_tokens": 54555628.0, + "step": 2119 + }, + { + "epoch": 0.23281352954096202, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 2.3350369930267334, + "learning_rate": 7.756222547584187e-07, + "loss": 1.0589, + "mean_token_accuracy": 0.6906818747520447, + "num_tokens": 54584663.0, + "step": 2120 + }, + { + "epoch": 0.23292334724357566, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.7865381240844727, + "learning_rate": 7.759882869692533e-07, + "loss": 1.0345, + "mean_token_accuracy": 0.7011660933494568, + "num_tokens": 54606332.0, + "step": 2121 + }, + { + "epoch": 0.23303316494618934, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.1184589862823486, + "learning_rate": 7.763543191800878e-07, + "loss": 1.1157, + "mean_token_accuracy": 0.6757371425628662, + "num_tokens": 54639008.0, + "step": 2122 + }, + { + "epoch": 0.23314298264880298, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.2184600830078125, + "learning_rate": 7.767203513909224e-07, + "loss": 1.0569, + "mean_token_accuracy": 0.6851350665092468, + "num_tokens": 54668941.0, + "step": 2123 + }, + { + "epoch": 0.23325280035141666, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.6034963130950928, + "learning_rate": 7.77086383601757e-07, + "loss": 1.0061, + "mean_token_accuracy": 0.6982555389404297, + "num_tokens": 54691348.0, + "step": 2124 + }, + { + "epoch": 0.2333626180540303, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 
2.3398971557617188, + "learning_rate": 7.774524158125915e-07, + "loss": 0.9977, + "mean_token_accuracy": 0.704623281955719, + "num_tokens": 54715840.0, + "step": 2125 + }, + { + "epoch": 0.23347243575664398, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.3714401721954346, + "learning_rate": 7.77818448023426e-07, + "loss": 1.1242, + "mean_token_accuracy": 0.665310263633728, + "num_tokens": 54742261.0, + "step": 2126 + }, + { + "epoch": 0.23358225345925762, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.0664310455322266, + "learning_rate": 7.781844802342606e-07, + "loss": 1.1069, + "mean_token_accuracy": 0.6720368266105652, + "num_tokens": 54775001.0, + "step": 2127 + }, + { + "epoch": 0.2336920711618713, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.4514472484588623, + "learning_rate": 7.785505124450952e-07, + "loss": 1.0824, + "mean_token_accuracy": 0.6812810301780701, + "num_tokens": 54800287.0, + "step": 2128 + }, + { + "epoch": 0.23380188886448494, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.3722341060638428, + "learning_rate": 7.789165446559297e-07, + "loss": 1.1032, + "mean_token_accuracy": 0.6808289885520935, + "num_tokens": 54827212.0, + "step": 2129 + }, + { + "epoch": 0.23391170656709862, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.4048001766204834, + "learning_rate": 7.792825768667643e-07, + "loss": 1.176, + "mean_token_accuracy": 0.657677412033081, + "num_tokens": 54855265.0, + "step": 2130 + }, + { + "epoch": 0.2340215242697123, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.524193525314331, + "learning_rate": 7.796486090775988e-07, + "loss": 1.0869, + "mean_token_accuracy": 0.6764020919799805, + "num_tokens": 54880558.0, + "step": 2131 + }, + { + "epoch": 0.23413134197232593, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.264415979385376, + "learning_rate": 7.800146412884333e-07, + "loss": 1.1192, + "mean_token_accuracy": 0.6739200353622437, + "num_tokens": 54908760.0, + "step": 2132 + }, + { + 
"epoch": 0.2342411596749396, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.6335604190826416, + "learning_rate": 7.80380673499268e-07, + "loss": 0.9355, + "mean_token_accuracy": 0.7134367823600769, + "num_tokens": 54927989.0, + "step": 2133 + }, + { + "epoch": 0.23435097737755325, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.3100578784942627, + "learning_rate": 7.807467057101025e-07, + "loss": 1.0631, + "mean_token_accuracy": 0.6857762336730957, + "num_tokens": 54955289.0, + "step": 2134 + }, + { + "epoch": 0.23446079508016693, + "ewc_loss": 5.811452865600586e-06, + "grad_norm": 2.3996803760528564, + "learning_rate": 7.811127379209371e-07, + "loss": 0.9932, + "mean_token_accuracy": 0.6986664533615112, + "num_tokens": 54981124.0, + "step": 2135 + }, + { + "epoch": 0.23457061278278057, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.2172651290893555, + "learning_rate": 7.814787701317716e-07, + "loss": 1.0842, + "mean_token_accuracy": 0.6817542910575867, + "num_tokens": 55009205.0, + "step": 2136 + }, + { + "epoch": 0.23468043048539425, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.3706893920898438, + "learning_rate": 7.818448023426061e-07, + "loss": 1.1373, + "mean_token_accuracy": 0.6671284437179565, + "num_tokens": 55036118.0, + "step": 2137 + }, + { + "epoch": 0.2347902481880079, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.3638713359832764, + "learning_rate": 7.822108345534407e-07, + "loss": 0.9721, + "mean_token_accuracy": 0.7039503455162048, + "num_tokens": 55060467.0, + "step": 2138 + }, + { + "epoch": 0.23490006589062157, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.372084379196167, + "learning_rate": 7.825768667642753e-07, + "loss": 1.1124, + "mean_token_accuracy": 0.6733094453811646, + "num_tokens": 55087375.0, + "step": 2139 + }, + { + "epoch": 0.23500988359323524, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.3290703296661377, + "learning_rate": 7.829428989751098e-07, + "loss": 1.0449, + 
"mean_token_accuracy": 0.6957300901412964, + "num_tokens": 55112810.0, + "step": 2140 + }, + { + "epoch": 0.2351197012958489, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 2.9839658737182617, + "learning_rate": 7.833089311859444e-07, + "loss": 1.0371, + "mean_token_accuracy": 0.6984206438064575, + "num_tokens": 55134221.0, + "step": 2141 + }, + { + "epoch": 0.23522951899846256, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.2883660793304443, + "learning_rate": 7.836749633967789e-07, + "loss": 1.0456, + "mean_token_accuracy": 0.6942622661590576, + "num_tokens": 55161282.0, + "step": 2142 + }, + { + "epoch": 0.2353393367010762, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.4794020652770996, + "learning_rate": 7.840409956076133e-07, + "loss": 1.0676, + "mean_token_accuracy": 0.687140941619873, + "num_tokens": 55185586.0, + "step": 2143 + }, + { + "epoch": 0.23544915440368988, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.3345205783843994, + "learning_rate": 7.844070278184481e-07, + "loss": 1.1702, + "mean_token_accuracy": 0.6623575687408447, + "num_tokens": 55214146.0, + "step": 2144 + }, + { + "epoch": 0.23555897210630353, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.6264944076538086, + "learning_rate": 7.847730600292826e-07, + "loss": 1.0176, + "mean_token_accuracy": 0.6931856274604797, + "num_tokens": 55234676.0, + "step": 2145 + }, + { + "epoch": 0.2356687898089172, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.445032835006714, + "learning_rate": 7.85139092240117e-07, + "loss": 0.951, + "mean_token_accuracy": 0.7107672095298767, + "num_tokens": 55257852.0, + "step": 2146 + }, + { + "epoch": 0.23577860751153085, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.5195505619049072, + "learning_rate": 7.855051244509517e-07, + "loss": 1.0328, + "mean_token_accuracy": 0.6924071907997131, + "num_tokens": 55280103.0, + "step": 2147 + }, + { + "epoch": 0.23588842521414452, + "ewc_loss": 5.8710575103759766e-06, + 
"grad_norm": 2.3301353454589844, + "learning_rate": 7.858711566617861e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6797364950180054, + "num_tokens": 55306689.0, + "step": 2148 + }, + { + "epoch": 0.2359982429167582, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.2985026836395264, + "learning_rate": 7.862371888726209e-07, + "loss": 1.1051, + "mean_token_accuracy": 0.6755602955818176, + "num_tokens": 55335557.0, + "step": 2149 + }, + { + "epoch": 0.23610806061937184, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.4271905422210693, + "learning_rate": 7.866032210834554e-07, + "loss": 1.0543, + "mean_token_accuracy": 0.6882010698318481, + "num_tokens": 55361347.0, + "step": 2150 + }, + { + "epoch": 0.2362178783219855, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.388502836227417, + "learning_rate": 7.869692532942898e-07, + "loss": 1.0451, + "mean_token_accuracy": 0.6875213384628296, + "num_tokens": 55386252.0, + "step": 2151 + }, + { + "epoch": 0.23632769602459916, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.4978015422821045, + "learning_rate": 7.873352855051244e-07, + "loss": 1.0726, + "mean_token_accuracy": 0.6926178932189941, + "num_tokens": 55407899.0, + "step": 2152 + }, + { + "epoch": 0.23643751372721283, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.4168827533721924, + "learning_rate": 7.877013177159589e-07, + "loss": 0.8849, + "mean_token_accuracy": 0.7290157079696655, + "num_tokens": 55430617.0, + "step": 2153 + }, + { + "epoch": 0.23654733142982648, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 2.628532886505127, + "learning_rate": 7.880673499267934e-07, + "loss": 0.8728, + "mean_token_accuracy": 0.7310020923614502, + "num_tokens": 55449753.0, + "step": 2154 + }, + { + "epoch": 0.23665714913244015, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.303276300430298, + "learning_rate": 7.884333821376281e-07, + "loss": 1.0578, + "mean_token_accuracy": 0.6876648664474487, + "num_tokens": 55477552.0, + 
"step": 2155 + }, + { + "epoch": 0.2367669668350538, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.4413721561431885, + "learning_rate": 7.887994143484626e-07, + "loss": 1.0071, + "mean_token_accuracy": 0.701543927192688, + "num_tokens": 55500986.0, + "step": 2156 + }, + { + "epoch": 0.23687678453766747, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.211790084838867, + "learning_rate": 7.891654465592971e-07, + "loss": 1.0715, + "mean_token_accuracy": 0.6858483552932739, + "num_tokens": 55533557.0, + "step": 2157 + }, + { + "epoch": 0.23698660224028115, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.268832206726074, + "learning_rate": 7.895314787701317e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6810089349746704, + "num_tokens": 55562766.0, + "step": 2158 + }, + { + "epoch": 0.2370964199428948, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.349458694458008, + "learning_rate": 7.898975109809662e-07, + "loss": 1.1202, + "mean_token_accuracy": 0.6700800657272339, + "num_tokens": 55590781.0, + "step": 2159 + }, + { + "epoch": 0.23720623764550847, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.485570192337036, + "learning_rate": 7.902635431918008e-07, + "loss": 1.0159, + "mean_token_accuracy": 0.6975610256195068, + "num_tokens": 55615755.0, + "step": 2160 + }, + { + "epoch": 0.2373160553481221, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.4243149757385254, + "learning_rate": 7.906295754026354e-07, + "loss": 1.0064, + "mean_token_accuracy": 0.7004568576812744, + "num_tokens": 55639958.0, + "step": 2161 + }, + { + "epoch": 0.2374258730507358, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.681870937347412, + "learning_rate": 7.909956076134699e-07, + "loss": 1.0808, + "mean_token_accuracy": 0.6799421310424805, + "num_tokens": 55661257.0, + "step": 2162 + }, + { + "epoch": 0.23753569075334943, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.308457612991333, + "learning_rate": 7.913616398243044e-07, + "loss": 
0.9579, + "mean_token_accuracy": 0.7206671237945557, + "num_tokens": 55687147.0, + "step": 2163 + }, + { + "epoch": 0.2376455084559631, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.1129746437072754, + "learning_rate": 7.91727672035139e-07, + "loss": 1.0702, + "mean_token_accuracy": 0.6844472885131836, + "num_tokens": 55718756.0, + "step": 2164 + }, + { + "epoch": 0.23775532615857675, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.3835020065307617, + "learning_rate": 7.920937042459736e-07, + "loss": 1.1171, + "mean_token_accuracy": 0.6738952398300171, + "num_tokens": 55745692.0, + "step": 2165 + }, + { + "epoch": 0.23786514386119043, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.336568593978882, + "learning_rate": 7.924597364568082e-07, + "loss": 1.0111, + "mean_token_accuracy": 0.705122172832489, + "num_tokens": 55771595.0, + "step": 2166 + }, + { + "epoch": 0.23797496156380407, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.3546714782714844, + "learning_rate": 7.928257686676427e-07, + "loss": 1.0429, + "mean_token_accuracy": 0.6880346536636353, + "num_tokens": 55795574.0, + "step": 2167 + }, + { + "epoch": 0.23808477926641775, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.3029141426086426, + "learning_rate": 7.931918008784772e-07, + "loss": 1.0783, + "mean_token_accuracy": 0.683659553527832, + "num_tokens": 55823116.0, + "step": 2168 + }, + { + "epoch": 0.23819459696903142, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.280902147293091, + "learning_rate": 7.935578330893118e-07, + "loss": 1.0483, + "mean_token_accuracy": 0.6855748295783997, + "num_tokens": 55849908.0, + "step": 2169 + }, + { + "epoch": 0.23830441467164507, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.1955513954162598, + "learning_rate": 7.939238653001463e-07, + "loss": 1.0744, + "mean_token_accuracy": 0.6822623014450073, + "num_tokens": 55879307.0, + "step": 2170 + }, + { + "epoch": 0.23841423237425874, + "ewc_loss": 5.900859832763672e-06, + 
"grad_norm": 2.4610188007354736, + "learning_rate": 7.942898975109809e-07, + "loss": 1.1139, + "mean_token_accuracy": 0.6759532690048218, + "num_tokens": 55907879.0, + "step": 2171 + }, + { + "epoch": 0.23852405007687238, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.600184202194214, + "learning_rate": 7.946559297218155e-07, + "loss": 1.126, + "mean_token_accuracy": 0.6762515306472778, + "num_tokens": 55930065.0, + "step": 2172 + }, + { + "epoch": 0.23863386777948606, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.4459657669067383, + "learning_rate": 7.9502196193265e-07, + "loss": 1.0247, + "mean_token_accuracy": 0.6950133442878723, + "num_tokens": 55954500.0, + "step": 2173 + }, + { + "epoch": 0.2387436854820997, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.3431310653686523, + "learning_rate": 7.953879941434845e-07, + "loss": 0.9519, + "mean_token_accuracy": 0.71195387840271, + "num_tokens": 55980294.0, + "step": 2174 + }, + { + "epoch": 0.23885350318471338, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.097388982772827, + "learning_rate": 7.957540263543191e-07, + "loss": 1.0379, + "mean_token_accuracy": 0.6957021355628967, + "num_tokens": 56012046.0, + "step": 2175 + }, + { + "epoch": 0.23896332088732702, + "ewc_loss": 5.900859832763672e-06, + "grad_norm": 2.329885482788086, + "learning_rate": 7.961200585651537e-07, + "loss": 1.0244, + "mean_token_accuracy": 0.6963669061660767, + "num_tokens": 56039172.0, + "step": 2176 + }, + { + "epoch": 0.2390731385899407, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.590654134750366, + "learning_rate": 7.964860907759882e-07, + "loss": 1.0172, + "mean_token_accuracy": 0.7019309401512146, + "num_tokens": 56061746.0, + "step": 2177 + }, + { + "epoch": 0.23918295629255437, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.590606451034546, + "learning_rate": 7.968521229868228e-07, + "loss": 1.0599, + "mean_token_accuracy": 0.6811516284942627, + "num_tokens": 56086553.0, + "step": 2178 + 
}, + { + "epoch": 0.23929277399516802, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.263356924057007, + "learning_rate": 7.972181551976573e-07, + "loss": 1.1159, + "mean_token_accuracy": 0.6732342839241028, + "num_tokens": 56118829.0, + "step": 2179 + }, + { + "epoch": 0.2394025916977817, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.297943592071533, + "learning_rate": 7.975841874084918e-07, + "loss": 1.063, + "mean_token_accuracy": 0.6856201887130737, + "num_tokens": 56145714.0, + "step": 2180 + }, + { + "epoch": 0.23951240940039534, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.4780051708221436, + "learning_rate": 7.979502196193265e-07, + "loss": 1.0626, + "mean_token_accuracy": 0.6822187900543213, + "num_tokens": 56170868.0, + "step": 2181 + }, + { + "epoch": 0.239622227103009, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.4126527309417725, + "learning_rate": 7.98316251830161e-07, + "loss": 1.0359, + "mean_token_accuracy": 0.6941251158714294, + "num_tokens": 56195851.0, + "step": 2182 + }, + { + "epoch": 0.23973204480562266, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.1986136436462402, + "learning_rate": 7.986822840409956e-07, + "loss": 1.0935, + "mean_token_accuracy": 0.6867532730102539, + "num_tokens": 56227161.0, + "step": 2183 + }, + { + "epoch": 0.23984186250823633, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.5862786769866943, + "learning_rate": 7.990483162518301e-07, + "loss": 1.1034, + "mean_token_accuracy": 0.6772201061248779, + "num_tokens": 56249842.0, + "step": 2184 + }, + { + "epoch": 0.23995168021084998, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.3276772499084473, + "learning_rate": 7.994143484626646e-07, + "loss": 1.059, + "mean_token_accuracy": 0.6882447004318237, + "num_tokens": 56276810.0, + "step": 2185 + }, + { + "epoch": 0.24006149791346365, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 3.4255433082580566, + "learning_rate": 7.997803806734992e-07, + "loss": 1.0084, + 
"mean_token_accuracy": 0.6972246170043945, + "num_tokens": 56301375.0, + "step": 2186 + }, + { + "epoch": 0.24017131561607732, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.2968454360961914, + "learning_rate": 8.001464128843338e-07, + "loss": 1.0666, + "mean_token_accuracy": 0.6836037635803223, + "num_tokens": 56327061.0, + "step": 2187 + }, + { + "epoch": 0.24028113331869097, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.2168023586273193, + "learning_rate": 8.005124450951683e-07, + "loss": 1.0352, + "mean_token_accuracy": 0.693838357925415, + "num_tokens": 56357185.0, + "step": 2188 + }, + { + "epoch": 0.24039095102130464, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.4918932914733887, + "learning_rate": 8.008784773060029e-07, + "loss": 1.06, + "mean_token_accuracy": 0.6855409145355225, + "num_tokens": 56380411.0, + "step": 2189 + }, + { + "epoch": 0.2405007687239183, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.580514669418335, + "learning_rate": 8.012445095168374e-07, + "loss": 1.0175, + "mean_token_accuracy": 0.7026712894439697, + "num_tokens": 56403539.0, + "step": 2190 + }, + { + "epoch": 0.24061058642653196, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.312648057937622, + "learning_rate": 8.016105417276719e-07, + "loss": 1.0238, + "mean_token_accuracy": 0.700609564781189, + "num_tokens": 56430207.0, + "step": 2191 + }, + { + "epoch": 0.2407204041291456, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.1281232833862305, + "learning_rate": 8.019765739385066e-07, + "loss": 1.0992, + "mean_token_accuracy": 0.6735260486602783, + "num_tokens": 56460386.0, + "step": 2192 + }, + { + "epoch": 0.24083022183175928, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.287872552871704, + "learning_rate": 8.023426061493411e-07, + "loss": 0.949, + "mean_token_accuracy": 0.7165433168411255, + "num_tokens": 56487140.0, + "step": 2193 + }, + { + "epoch": 0.24094003953437293, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 
2.641547203063965, + "learning_rate": 8.027086383601756e-07, + "loss": 1.1209, + "mean_token_accuracy": 0.6693764328956604, + "num_tokens": 56508781.0, + "step": 2194 + }, + { + "epoch": 0.2410498572369866, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.5011940002441406, + "learning_rate": 8.030746705710102e-07, + "loss": 1.0568, + "mean_token_accuracy": 0.6857101321220398, + "num_tokens": 56532472.0, + "step": 2195 + }, + { + "epoch": 0.24115967493960028, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.059109687805176, + "learning_rate": 8.034407027818447e-07, + "loss": 1.0909, + "mean_token_accuracy": 0.6817746758460999, + "num_tokens": 56567185.0, + "step": 2196 + }, + { + "epoch": 0.24126949264221392, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.1287832260131836, + "learning_rate": 8.038067349926793e-07, + "loss": 0.9993, + "mean_token_accuracy": 0.7059749364852905, + "num_tokens": 56596186.0, + "step": 2197 + }, + { + "epoch": 0.2413793103448276, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 4.401136875152588, + "learning_rate": 8.041727672035139e-07, + "loss": 0.9832, + "mean_token_accuracy": 0.6998525857925415, + "num_tokens": 56621026.0, + "step": 2198 + }, + { + "epoch": 0.24148912804744124, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.615692377090454, + "learning_rate": 8.045387994143484e-07, + "loss": 1.0105, + "mean_token_accuracy": 0.699421226978302, + "num_tokens": 56641149.0, + "step": 2199 + }, + { + "epoch": 0.24159894575005492, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.46952223777771, + "learning_rate": 8.04904831625183e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.701615571975708, + "num_tokens": 56663743.0, + "step": 2200 + }, + { + "epoch": 0.24170876345266856, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 2.211066722869873, + "learning_rate": 8.052708638360175e-07, + "loss": 0.981, + "mean_token_accuracy": 0.7154044508934021, + "num_tokens": 56690487.0, + "step": 2201 + }, + { + 
"epoch": 0.24181858115528224, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.514308452606201, + "learning_rate": 8.05636896046852e-07, + "loss": 1.0017, + "mean_token_accuracy": 0.6949545741081238, + "num_tokens": 56714174.0, + "step": 2202 + }, + { + "epoch": 0.24192839885789588, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.280526876449585, + "learning_rate": 8.060029282576867e-07, + "loss": 1.1056, + "mean_token_accuracy": 0.6763481497764587, + "num_tokens": 56743225.0, + "step": 2203 + }, + { + "epoch": 0.24203821656050956, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.3832576274871826, + "learning_rate": 8.063689604685212e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6804009079933167, + "num_tokens": 56769588.0, + "step": 2204 + }, + { + "epoch": 0.2421480342631232, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.4125545024871826, + "learning_rate": 8.067349926793557e-07, + "loss": 1.0314, + "mean_token_accuracy": 0.6948444247245789, + "num_tokens": 56793483.0, + "step": 2205 + }, + { + "epoch": 0.24225785196573688, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.225189685821533, + "learning_rate": 8.071010248901903e-07, + "loss": 1.0801, + "mean_token_accuracy": 0.6893396377563477, + "num_tokens": 56821789.0, + "step": 2206 + }, + { + "epoch": 0.24236766966835055, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.4350392818450928, + "learning_rate": 8.074670571010248e-07, + "loss": 0.9736, + "mean_token_accuracy": 0.7057482004165649, + "num_tokens": 56847788.0, + "step": 2207 + }, + { + "epoch": 0.2424774873709642, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.3369905948638916, + "learning_rate": 8.078330893118594e-07, + "loss": 0.9834, + "mean_token_accuracy": 0.7075312733650208, + "num_tokens": 56874038.0, + "step": 2208 + }, + { + "epoch": 0.24258730507357787, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.5473079681396484, + "learning_rate": 8.08199121522694e-07, + "loss": 1.0557, + 
"mean_token_accuracy": 0.6891316175460815, + "num_tokens": 56895943.0, + "step": 2209 + }, + { + "epoch": 0.24269712277619152, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.438215494155884, + "learning_rate": 8.085651537335285e-07, + "loss": 0.9998, + "mean_token_accuracy": 0.6993156671524048, + "num_tokens": 56920459.0, + "step": 2210 + }, + { + "epoch": 0.2428069404788052, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 2.3961682319641113, + "learning_rate": 8.08931185944363e-07, + "loss": 1.128, + "mean_token_accuracy": 0.671722412109375, + "num_tokens": 56947307.0, + "step": 2211 + }, + { + "epoch": 0.24291675818141883, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.3108487129211426, + "learning_rate": 8.092972181551976e-07, + "loss": 1.0545, + "mean_token_accuracy": 0.6882354617118835, + "num_tokens": 56974344.0, + "step": 2212 + }, + { + "epoch": 0.2430265758840325, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.1837093830108643, + "learning_rate": 8.096632503660322e-07, + "loss": 1.177, + "mean_token_accuracy": 0.6664236187934875, + "num_tokens": 57005194.0, + "step": 2213 + }, + { + "epoch": 0.24313639358664615, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.522069215774536, + "learning_rate": 8.100292825768668e-07, + "loss": 0.9732, + "mean_token_accuracy": 0.7019119262695312, + "num_tokens": 57027359.0, + "step": 2214 + }, + { + "epoch": 0.24324621128925983, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.5319817066192627, + "learning_rate": 8.103953147877013e-07, + "loss": 1.0082, + "mean_token_accuracy": 0.6993172764778137, + "num_tokens": 57050898.0, + "step": 2215 + }, + { + "epoch": 0.2433560289918735, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.5150201320648193, + "learning_rate": 8.107613469985358e-07, + "loss": 0.9801, + "mean_token_accuracy": 0.7003560066223145, + "num_tokens": 57075058.0, + "step": 2216 + }, + { + "epoch": 0.24346584669448715, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 
2.3759384155273438, + "learning_rate": 8.111273792093704e-07, + "loss": 0.9753, + "mean_token_accuracy": 0.7079771757125854, + "num_tokens": 57100386.0, + "step": 2217 + }, + { + "epoch": 0.24357566439710082, + "ewc_loss": 5.990266799926758e-06, + "grad_norm": 2.3591363430023193, + "learning_rate": 8.114934114202049e-07, + "loss": 1.0142, + "mean_token_accuracy": 0.6954210996627808, + "num_tokens": 57126772.0, + "step": 2218 + }, + { + "epoch": 0.24368548209971447, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.454411268234253, + "learning_rate": 8.118594436310395e-07, + "loss": 1.0606, + "mean_token_accuracy": 0.6876386404037476, + "num_tokens": 57152044.0, + "step": 2219 + }, + { + "epoch": 0.24379529980232814, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.36889910697937, + "learning_rate": 8.122254758418741e-07, + "loss": 1.0614, + "mean_token_accuracy": 0.691453218460083, + "num_tokens": 57176202.0, + "step": 2220 + }, + { + "epoch": 0.2439051175049418, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.595428943634033, + "learning_rate": 8.125915080527086e-07, + "loss": 0.9916, + "mean_token_accuracy": 0.7015664577484131, + "num_tokens": 57198532.0, + "step": 2221 + }, + { + "epoch": 0.24401493520755546, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.2471730709075928, + "learning_rate": 8.129575402635431e-07, + "loss": 1.046, + "mean_token_accuracy": 0.6900023818016052, + "num_tokens": 57228410.0, + "step": 2222 + }, + { + "epoch": 0.2441247529101691, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.1956844329833984, + "learning_rate": 8.133235724743777e-07, + "loss": 0.9954, + "mean_token_accuracy": 0.7012602686882019, + "num_tokens": 57257091.0, + "step": 2223 + }, + { + "epoch": 0.24423457061278278, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.2789742946624756, + "learning_rate": 8.136896046852123e-07, + "loss": 1.0065, + "mean_token_accuracy": 0.6964946389198303, + "num_tokens": 57282667.0, + "step": 2224 + }, + { + 
"epoch": 0.24434438831539645, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.6055290699005127, + "learning_rate": 8.140556368960468e-07, + "loss": 0.9559, + "mean_token_accuracy": 0.7106224298477173, + "num_tokens": 57301885.0, + "step": 2225 + }, + { + "epoch": 0.2444542060180101, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.776296854019165, + "learning_rate": 8.144216691068814e-07, + "loss": 1.043, + "mean_token_accuracy": 0.7010713815689087, + "num_tokens": 57322637.0, + "step": 2226 + }, + { + "epoch": 0.24456402372062377, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 2.6476690769195557, + "learning_rate": 8.147877013177159e-07, + "loss": 1.0823, + "mean_token_accuracy": 0.6860284805297852, + "num_tokens": 57346366.0, + "step": 2227 + }, + { + "epoch": 0.24467384142323742, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.406700611114502, + "learning_rate": 8.151537335285504e-07, + "loss": 1.0653, + "mean_token_accuracy": 0.6945481300354004, + "num_tokens": 57373168.0, + "step": 2228 + }, + { + "epoch": 0.2447836591258511, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.326575756072998, + "learning_rate": 8.155197657393851e-07, + "loss": 1.011, + "mean_token_accuracy": 0.6948019862174988, + "num_tokens": 57402621.0, + "step": 2229 + }, + { + "epoch": 0.24489347682846474, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.5827553272247314, + "learning_rate": 8.158857979502196e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.694116473197937, + "num_tokens": 57426230.0, + "step": 2230 + }, + { + "epoch": 0.2450032945310784, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.387660264968872, + "learning_rate": 8.162518301610542e-07, + "loss": 1.0084, + "mean_token_accuracy": 0.702528715133667, + "num_tokens": 57453725.0, + "step": 2231 + }, + { + "epoch": 0.24511311223369206, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.2008163928985596, + "learning_rate": 8.166178623718887e-07, + "loss": 1.0713, + 
"mean_token_accuracy": 0.6830896139144897, + "num_tokens": 57484306.0, + "step": 2232 + }, + { + "epoch": 0.24522292993630573, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.5408239364624023, + "learning_rate": 8.169838945827232e-07, + "loss": 1.0563, + "mean_token_accuracy": 0.690726637840271, + "num_tokens": 57510966.0, + "step": 2233 + }, + { + "epoch": 0.2453327476389194, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 2.290130376815796, + "learning_rate": 8.173499267935578e-07, + "loss": 1.0385, + "mean_token_accuracy": 0.6894041299819946, + "num_tokens": 57538533.0, + "step": 2234 + }, + { + "epoch": 0.24544256534153305, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.4154345989227295, + "learning_rate": 8.177159590043924e-07, + "loss": 1.0289, + "mean_token_accuracy": 0.690536618232727, + "num_tokens": 57564275.0, + "step": 2235 + }, + { + "epoch": 0.24555238304414673, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.147982120513916, + "learning_rate": 8.180819912152269e-07, + "loss": 1.0185, + "mean_token_accuracy": 0.6994647979736328, + "num_tokens": 57594837.0, + "step": 2236 + }, + { + "epoch": 0.24566220074676037, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.252432107925415, + "learning_rate": 8.184480234260615e-07, + "loss": 0.9935, + "mean_token_accuracy": 0.7126655578613281, + "num_tokens": 57620478.0, + "step": 2237 + }, + { + "epoch": 0.24577201844937405, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.3486320972442627, + "learning_rate": 8.18814055636896e-07, + "loss": 1.0925, + "mean_token_accuracy": 0.6826459765434265, + "num_tokens": 57647752.0, + "step": 2238 + }, + { + "epoch": 0.2458818361519877, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.4302597045898438, + "learning_rate": 8.191800878477305e-07, + "loss": 1.0122, + "mean_token_accuracy": 0.6989227533340454, + "num_tokens": 57672591.0, + "step": 2239 + }, + { + "epoch": 0.24599165385460137, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 
2.4286718368530273, + "learning_rate": 8.195461200585652e-07, + "loss": 1.0002, + "mean_token_accuracy": 0.7010324001312256, + "num_tokens": 57696420.0, + "step": 2240 + }, + { + "epoch": 0.246101471557215, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.4776251316070557, + "learning_rate": 8.199121522693997e-07, + "loss": 1.0307, + "mean_token_accuracy": 0.6895335912704468, + "num_tokens": 57720386.0, + "step": 2241 + }, + { + "epoch": 0.2462112892598287, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.6540327072143555, + "learning_rate": 8.202781844802342e-07, + "loss": 1.0487, + "mean_token_accuracy": 0.693496584892273, + "num_tokens": 57746173.0, + "step": 2242 + }, + { + "epoch": 0.24632110696244233, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.492326259613037, + "learning_rate": 8.206442166910688e-07, + "loss": 1.0456, + "mean_token_accuracy": 0.6902428865432739, + "num_tokens": 57770085.0, + "step": 2243 + }, + { + "epoch": 0.246430924665056, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.1216869354248047, + "learning_rate": 8.210102489019033e-07, + "loss": 0.9571, + "mean_token_accuracy": 0.7090414762496948, + "num_tokens": 57799026.0, + "step": 2244 + }, + { + "epoch": 0.24654074236766968, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.392085313796997, + "learning_rate": 8.213762811127379e-07, + "loss": 1.0332, + "mean_token_accuracy": 0.6939477920532227, + "num_tokens": 57824626.0, + "step": 2245 + }, + { + "epoch": 0.24665056007028333, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.6175625324249268, + "learning_rate": 8.217423133235725e-07, + "loss": 1.0535, + "mean_token_accuracy": 0.6815420389175415, + "num_tokens": 57847648.0, + "step": 2246 + }, + { + "epoch": 0.246760377772897, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.490961790084839, + "learning_rate": 8.22108345534407e-07, + "loss": 1.0492, + "mean_token_accuracy": 0.686745285987854, + "num_tokens": 57870239.0, + "step": 2247 + }, + { + 
"epoch": 0.24687019547551065, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.2102556228637695, + "learning_rate": 8.224743777452416e-07, + "loss": 1.0239, + "mean_token_accuracy": 0.6928597092628479, + "num_tokens": 57899702.0, + "step": 2248 + }, + { + "epoch": 0.24698001317812432, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.855466842651367, + "learning_rate": 8.228404099560761e-07, + "loss": 1.0025, + "mean_token_accuracy": 0.7036806344985962, + "num_tokens": 57918400.0, + "step": 2249 + }, + { + "epoch": 0.24708983088073797, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.546870708465576, + "learning_rate": 8.232064421669106e-07, + "loss": 1.0314, + "mean_token_accuracy": 0.6936691999435425, + "num_tokens": 57940924.0, + "step": 2250 + }, + { + "epoch": 0.24719964858335164, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.7936675548553467, + "learning_rate": 8.235724743777453e-07, + "loss": 0.9541, + "mean_token_accuracy": 0.7184832692146301, + "num_tokens": 57959917.0, + "step": 2251 + }, + { + "epoch": 0.24730946628596528, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.1715924739837646, + "learning_rate": 8.239385065885798e-07, + "loss": 1.0786, + "mean_token_accuracy": 0.6883219480514526, + "num_tokens": 57989935.0, + "step": 2252 + }, + { + "epoch": 0.24741928398857896, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.70381760597229, + "learning_rate": 8.243045387994143e-07, + "loss": 0.9879, + "mean_token_accuracy": 0.7111453413963318, + "num_tokens": 58009541.0, + "step": 2253 + }, + { + "epoch": 0.24752910169119263, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.143989324569702, + "learning_rate": 8.246705710102489e-07, + "loss": 1.0264, + "mean_token_accuracy": 0.6909226775169373, + "num_tokens": 58042087.0, + "step": 2254 + }, + { + "epoch": 0.24763891939380628, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.2607102394104004, + "learning_rate": 8.250366032210834e-07, + "loss": 0.9774, + 
"mean_token_accuracy": 0.7138672471046448, + "num_tokens": 58070764.0, + "step": 2255 + }, + { + "epoch": 0.24774873709641995, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 2.7868971824645996, + "learning_rate": 8.25402635431918e-07, + "loss": 0.9937, + "mean_token_accuracy": 0.7034626007080078, + "num_tokens": 58090387.0, + "step": 2256 + }, + { + "epoch": 0.2478585547990336, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.4476263523101807, + "learning_rate": 8.257686676427526e-07, + "loss": 1.0809, + "mean_token_accuracy": 0.6862469911575317, + "num_tokens": 58113755.0, + "step": 2257 + }, + { + "epoch": 0.24796837250164727, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.688509702682495, + "learning_rate": 8.261346998535871e-07, + "loss": 0.9592, + "mean_token_accuracy": 0.7160426378250122, + "num_tokens": 58135562.0, + "step": 2258 + }, + { + "epoch": 0.24807819020426092, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.4597702026367188, + "learning_rate": 8.265007320644216e-07, + "loss": 1.0692, + "mean_token_accuracy": 0.6812787055969238, + "num_tokens": 58161079.0, + "step": 2259 + }, + { + "epoch": 0.2481880079068746, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.398092746734619, + "learning_rate": 8.268667642752562e-07, + "loss": 0.9638, + "mean_token_accuracy": 0.7168563604354858, + "num_tokens": 58186104.0, + "step": 2260 + }, + { + "epoch": 0.24829782560948824, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.048445701599121, + "learning_rate": 8.272327964860908e-07, + "loss": 1.1008, + "mean_token_accuracy": 0.677625298500061, + "num_tokens": 58221477.0, + "step": 2261 + }, + { + "epoch": 0.2484076433121019, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 2.304750680923462, + "learning_rate": 8.275988286969253e-07, + "loss": 1.0848, + "mean_token_accuracy": 0.6826578378677368, + "num_tokens": 58247976.0, + "step": 2262 + }, + { + "epoch": 0.24851746101471558, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 
2.6990649700164795, + "learning_rate": 8.279648609077599e-07, + "loss": 0.9705, + "mean_token_accuracy": 0.7112194895744324, + "num_tokens": 58267331.0, + "step": 2263 + }, + { + "epoch": 0.24862727871732923, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.4108705520629883, + "learning_rate": 8.283308931185944e-07, + "loss": 1.0615, + "mean_token_accuracy": 0.6828076839447021, + "num_tokens": 58292582.0, + "step": 2264 + }, + { + "epoch": 0.2487370964199429, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.4529380798339844, + "learning_rate": 8.28696925329429e-07, + "loss": 1.1487, + "mean_token_accuracy": 0.6609886884689331, + "num_tokens": 58318884.0, + "step": 2265 + }, + { + "epoch": 0.24884691412255655, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.481902599334717, + "learning_rate": 8.290629575402635e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6904692053794861, + "num_tokens": 58344826.0, + "step": 2266 + }, + { + "epoch": 0.24895673182517022, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.396636486053467, + "learning_rate": 8.294289897510981e-07, + "loss": 1.041, + "mean_token_accuracy": 0.6896044015884399, + "num_tokens": 58368859.0, + "step": 2267 + }, + { + "epoch": 0.24906654952778387, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.3364720344543457, + "learning_rate": 8.297950219619327e-07, + "loss": 1.0109, + "mean_token_accuracy": 0.7023586630821228, + "num_tokens": 58397377.0, + "step": 2268 + }, + { + "epoch": 0.24917636723039754, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.445895195007324, + "learning_rate": 8.301610541727672e-07, + "loss": 0.9043, + "mean_token_accuracy": 0.7303728461265564, + "num_tokens": 58419411.0, + "step": 2269 + }, + { + "epoch": 0.2492861849330112, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.2332043647766113, + "learning_rate": 8.305270863836017e-07, + "loss": 0.9908, + "mean_token_accuracy": 0.6997944116592407, + "num_tokens": 58447935.0, + "step": 2270 + }, + { + 
"epoch": 0.24939600263562486, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.317058563232422, + "learning_rate": 8.308931185944363e-07, + "loss": 1.0122, + "mean_token_accuracy": 0.6988658308982849, + "num_tokens": 58474769.0, + "step": 2271 + }, + { + "epoch": 0.24950582033823854, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.4740569591522217, + "learning_rate": 8.312591508052709e-07, + "loss": 1.154, + "mean_token_accuracy": 0.6603432893753052, + "num_tokens": 58504789.0, + "step": 2272 + }, + { + "epoch": 0.24961563804085218, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.874051094055176, + "learning_rate": 8.316251830161054e-07, + "loss": 0.9951, + "mean_token_accuracy": 0.6994088888168335, + "num_tokens": 58526700.0, + "step": 2273 + }, + { + "epoch": 0.24972545574346586, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.262437582015991, + "learning_rate": 8.3199121522694e-07, + "loss": 1.0452, + "mean_token_accuracy": 0.685011088848114, + "num_tokens": 58555555.0, + "step": 2274 + }, + { + "epoch": 0.2498352734460795, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.4843666553497314, + "learning_rate": 8.323572474377745e-07, + "loss": 0.8812, + "mean_token_accuracy": 0.7332584857940674, + "num_tokens": 58577667.0, + "step": 2275 + }, + { + "epoch": 0.24994509114869318, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.23762583732605, + "learning_rate": 8.32723279648609e-07, + "loss": 1.055, + "mean_token_accuracy": 0.6918386220932007, + "num_tokens": 58606031.0, + "step": 2276 + }, + { + "epoch": 0.2500549088513068, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.3994219303131104, + "learning_rate": 8.330893118594437e-07, + "loss": 0.9918, + "mean_token_accuracy": 0.7037789821624756, + "num_tokens": 58631967.0, + "step": 2277 + }, + { + "epoch": 0.25016472655392047, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.2091927528381348, + "learning_rate": 8.334553440702782e-07, + "loss": 1.0607, + "mean_token_accuracy": 
0.6844491362571716, + "num_tokens": 58662429.0, + "step": 2278 + }, + { + "epoch": 0.25027454425653417, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.3438358306884766, + "learning_rate": 8.338213762811127e-07, + "loss": 1.0888, + "mean_token_accuracy": 0.673007607460022, + "num_tokens": 58689786.0, + "step": 2279 + }, + { + "epoch": 0.2503843619591478, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.5768866539001465, + "learning_rate": 8.341874084919473e-07, + "loss": 0.9932, + "mean_token_accuracy": 0.7014090418815613, + "num_tokens": 58711364.0, + "step": 2280 + }, + { + "epoch": 0.25049417966176146, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.3212249279022217, + "learning_rate": 8.345534407027818e-07, + "loss": 1.0597, + "mean_token_accuracy": 0.6876473426818848, + "num_tokens": 58737840.0, + "step": 2281 + }, + { + "epoch": 0.25060399736437516, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.3095109462738037, + "learning_rate": 8.349194729136164e-07, + "loss": 1.1132, + "mean_token_accuracy": 0.6768996119499207, + "num_tokens": 58766062.0, + "step": 2282 + }, + { + "epoch": 0.2507138150669888, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.4286694526672363, + "learning_rate": 8.35285505124451e-07, + "loss": 1.052, + "mean_token_accuracy": 0.6845513582229614, + "num_tokens": 58792625.0, + "step": 2283 + }, + { + "epoch": 0.25082363276960246, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.497283935546875, + "learning_rate": 8.356515373352855e-07, + "loss": 1.1724, + "mean_token_accuracy": 0.6618884801864624, + "num_tokens": 58817547.0, + "step": 2284 + }, + { + "epoch": 0.2509334504722161, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 6.960994720458984, + "learning_rate": 8.360175695461201e-07, + "loss": 1.0501, + "mean_token_accuracy": 0.6872842907905579, + "num_tokens": 58845967.0, + "step": 2285 + }, + { + "epoch": 0.2510432681748298, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.192798137664795, + 
"learning_rate": 8.363836017569546e-07, + "loss": 1.0386, + "mean_token_accuracy": 0.6946566104888916, + "num_tokens": 58876861.0, + "step": 2286 + }, + { + "epoch": 0.25115308587744345, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.566269874572754, + "learning_rate": 8.36749633967789e-07, + "loss": 1.0592, + "mean_token_accuracy": 0.6879284381866455, + "num_tokens": 58900848.0, + "step": 2287 + }, + { + "epoch": 0.2512629035800571, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.5018179416656494, + "learning_rate": 8.371156661786238e-07, + "loss": 0.9956, + "mean_token_accuracy": 0.7029820084571838, + "num_tokens": 58922927.0, + "step": 2288 + }, + { + "epoch": 0.25137272128267074, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.3044445514678955, + "learning_rate": 8.374816983894583e-07, + "loss": 1.0362, + "mean_token_accuracy": 0.6877326369285583, + "num_tokens": 58951123.0, + "step": 2289 + }, + { + "epoch": 0.25148253898528444, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.6625664234161377, + "learning_rate": 8.378477306002928e-07, + "loss": 1.0666, + "mean_token_accuracy": 0.6852400302886963, + "num_tokens": 58970496.0, + "step": 2290 + }, + { + "epoch": 0.2515923566878981, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.314023733139038, + "learning_rate": 8.382137628111274e-07, + "loss": 1.0597, + "mean_token_accuracy": 0.6888284683227539, + "num_tokens": 58994989.0, + "step": 2291 + }, + { + "epoch": 0.25170217439051173, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.662463665008545, + "learning_rate": 8.385797950219619e-07, + "loss": 0.9832, + "mean_token_accuracy": 0.7045918703079224, + "num_tokens": 59015235.0, + "step": 2292 + }, + { + "epoch": 0.25181199209312544, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.3911402225494385, + "learning_rate": 8.389458272327965e-07, + "loss": 0.991, + "mean_token_accuracy": 0.6978827118873596, + "num_tokens": 59038421.0, + "step": 2293 + }, + { + "epoch": 0.2519218097957391, 
+ "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.433379650115967, + "learning_rate": 8.393118594436311e-07, + "loss": 1.0831, + "mean_token_accuracy": 0.6745830774307251, + "num_tokens": 59064410.0, + "step": 2294 + }, + { + "epoch": 0.25203162749835273, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 2.5116918087005615, + "learning_rate": 8.396778916544656e-07, + "loss": 1.0546, + "mean_token_accuracy": 0.6876948475837708, + "num_tokens": 59087245.0, + "step": 2295 + }, + { + "epoch": 0.2521414452009664, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.454353094100952, + "learning_rate": 8.400439238653e-07, + "loss": 1.0424, + "mean_token_accuracy": 0.689422070980072, + "num_tokens": 59111988.0, + "step": 2296 + }, + { + "epoch": 0.2522512629035801, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.5875332355499268, + "learning_rate": 8.404099560761346e-07, + "loss": 1.0417, + "mean_token_accuracy": 0.6970028877258301, + "num_tokens": 59137371.0, + "step": 2297 + }, + { + "epoch": 0.2523610806061937, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.393911123275757, + "learning_rate": 8.407759882869691e-07, + "loss": 1.0356, + "mean_token_accuracy": 0.6948807239532471, + "num_tokens": 59163108.0, + "step": 2298 + }, + { + "epoch": 0.25247089830880737, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.3083982467651367, + "learning_rate": 8.411420204978039e-07, + "loss": 1.0928, + "mean_token_accuracy": 0.6770353317260742, + "num_tokens": 59191057.0, + "step": 2299 + }, + { + "epoch": 0.252580716011421, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.229120969772339, + "learning_rate": 8.415080527086383e-07, + "loss": 1.1002, + "mean_token_accuracy": 0.6769025325775146, + "num_tokens": 59220690.0, + "step": 2300 + }, + { + "epoch": 0.2526905337140347, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.3005764484405518, + "learning_rate": 8.418740849194728e-07, + "loss": 1.0029, + "mean_token_accuracy": 0.7049754858016968, + 
"num_tokens": 59247310.0, + "step": 2301 + }, + { + "epoch": 0.25280035141664836, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 2.5361034870147705, + "learning_rate": 8.422401171303074e-07, + "loss": 1.0207, + "mean_token_accuracy": 0.6999274492263794, + "num_tokens": 59271797.0, + "step": 2302 + }, + { + "epoch": 0.252910169119262, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.3953733444213867, + "learning_rate": 8.426061493411419e-07, + "loss": 1.0104, + "mean_token_accuracy": 0.7005613446235657, + "num_tokens": 59299996.0, + "step": 2303 + }, + { + "epoch": 0.2530199868218757, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.330965518951416, + "learning_rate": 8.429721815519765e-07, + "loss": 1.0908, + "mean_token_accuracy": 0.6802542209625244, + "num_tokens": 59326639.0, + "step": 2304 + }, + { + "epoch": 0.25312980452448935, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.3902995586395264, + "learning_rate": 8.433382137628111e-07, + "loss": 1.0243, + "mean_token_accuracy": 0.7020983695983887, + "num_tokens": 59353176.0, + "step": 2305 + }, + { + "epoch": 0.253239622227103, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.2000062465667725, + "learning_rate": 8.437042459736456e-07, + "loss": 1.031, + "mean_token_accuracy": 0.6968773603439331, + "num_tokens": 59381590.0, + "step": 2306 + }, + { + "epoch": 0.25334943992971665, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.0527825355529785, + "learning_rate": 8.440702781844801e-07, + "loss": 0.9932, + "mean_token_accuracy": 0.7042186260223389, + "num_tokens": 59411696.0, + "step": 2307 + }, + { + "epoch": 0.25345925763233035, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.33069109916687, + "learning_rate": 8.444363103953147e-07, + "loss": 1.066, + "mean_token_accuracy": 0.6790431141853333, + "num_tokens": 59438691.0, + "step": 2308 + }, + { + "epoch": 0.253569075334944, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.2133548259735107, + "learning_rate": 
8.448023426061493e-07, + "loss": 1.0229, + "mean_token_accuracy": 0.6952004432678223, + "num_tokens": 59467636.0, + "step": 2309 + }, + { + "epoch": 0.25367889303755764, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.241438388824463, + "learning_rate": 8.451683748169838e-07, + "loss": 1.0585, + "mean_token_accuracy": 0.6887688636779785, + "num_tokens": 59496548.0, + "step": 2310 + }, + { + "epoch": 0.25378871074017134, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.5706403255462646, + "learning_rate": 8.455344070278184e-07, + "loss": 1.0555, + "mean_token_accuracy": 0.6885865926742554, + "num_tokens": 59518194.0, + "step": 2311 + }, + { + "epoch": 0.253898528442785, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.5432240962982178, + "learning_rate": 8.459004392386529e-07, + "loss": 0.9263, + "mean_token_accuracy": 0.7200496792793274, + "num_tokens": 59540317.0, + "step": 2312 + }, + { + "epoch": 0.25400834614539863, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.3367695808410645, + "learning_rate": 8.462664714494874e-07, + "loss": 1.0914, + "mean_token_accuracy": 0.672964334487915, + "num_tokens": 59569101.0, + "step": 2313 + }, + { + "epoch": 0.2541181638480123, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.4327237606048584, + "learning_rate": 8.46632503660322e-07, + "loss": 1.0816, + "mean_token_accuracy": 0.6799023151397705, + "num_tokens": 59597709.0, + "step": 2314 + }, + { + "epoch": 0.254227981550626, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.238896131515503, + "learning_rate": 8.469985358711566e-07, + "loss": 1.0324, + "mean_token_accuracy": 0.6932125687599182, + "num_tokens": 59626571.0, + "step": 2315 + }, + { + "epoch": 0.2543377992532396, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.411656379699707, + "learning_rate": 8.473645680819912e-07, + "loss": 1.0361, + "mean_token_accuracy": 0.685150682926178, + "num_tokens": 59653507.0, + "step": 2316 + }, + { + "epoch": 0.2544476169558533, + 
"ewc_loss": 6.3478946685791016e-06, + "grad_norm": 2.3969290256500244, + "learning_rate": 8.477306002928257e-07, + "loss": 0.9991, + "mean_token_accuracy": 0.7093112468719482, + "num_tokens": 59682791.0, + "step": 2317 + }, + { + "epoch": 0.2545574346584669, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.5263051986694336, + "learning_rate": 8.480966325036602e-07, + "loss": 1.1327, + "mean_token_accuracy": 0.6695300936698914, + "num_tokens": 59708535.0, + "step": 2318 + }, + { + "epoch": 0.2546672523610806, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.8587875366210938, + "learning_rate": 8.484626647144948e-07, + "loss": 1.0012, + "mean_token_accuracy": 0.7032554149627686, + "num_tokens": 59728949.0, + "step": 2319 + }, + { + "epoch": 0.25477707006369427, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.5343310832977295, + "learning_rate": 8.488286969253294e-07, + "loss": 0.9934, + "mean_token_accuracy": 0.7086228132247925, + "num_tokens": 59750730.0, + "step": 2320 + }, + { + "epoch": 0.2548868877663079, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.454212188720703, + "learning_rate": 8.491947291361639e-07, + "loss": 1.0248, + "mean_token_accuracy": 0.6974421739578247, + "num_tokens": 59773449.0, + "step": 2321 + }, + { + "epoch": 0.2549967054689216, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.4435529708862305, + "learning_rate": 8.495607613469985e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6821587681770325, + "num_tokens": 59797987.0, + "step": 2322 + }, + { + "epoch": 0.25510652317153526, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.627023458480835, + "learning_rate": 8.49926793557833e-07, + "loss": 0.9851, + "mean_token_accuracy": 0.6990402936935425, + "num_tokens": 59819138.0, + "step": 2323 + }, + { + "epoch": 0.2552163408741489, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.5188632011413574, + "learning_rate": 8.502928257686675e-07, + "loss": 1.081, + "mean_token_accuracy": 0.689713716506958, + 
"num_tokens": 59842604.0, + "step": 2324 + }, + { + "epoch": 0.25532615857676255, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.2823259830474854, + "learning_rate": 8.506588579795022e-07, + "loss": 1.0464, + "mean_token_accuracy": 0.6911396980285645, + "num_tokens": 59869413.0, + "step": 2325 + }, + { + "epoch": 0.25543597627937625, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.3923890590667725, + "learning_rate": 8.510248901903367e-07, + "loss": 1.0585, + "mean_token_accuracy": 0.6852656602859497, + "num_tokens": 59896911.0, + "step": 2326 + }, + { + "epoch": 0.2555457939819899, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.497739791870117, + "learning_rate": 8.513909224011712e-07, + "loss": 0.9871, + "mean_token_accuracy": 0.7038193941116333, + "num_tokens": 59920209.0, + "step": 2327 + }, + { + "epoch": 0.25565561168460355, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.460930347442627, + "learning_rate": 8.517569546120058e-07, + "loss": 1.0768, + "mean_token_accuracy": 0.6816325783729553, + "num_tokens": 59946626.0, + "step": 2328 + }, + { + "epoch": 0.25576542938721725, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.2625555992126465, + "learning_rate": 8.521229868228403e-07, + "loss": 1.0724, + "mean_token_accuracy": 0.6885888576507568, + "num_tokens": 59975887.0, + "step": 2329 + }, + { + "epoch": 0.2558752470898309, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.3511743545532227, + "learning_rate": 8.524890190336748e-07, + "loss": 0.9587, + "mean_token_accuracy": 0.7124102115631104, + "num_tokens": 60000691.0, + "step": 2330 + }, + { + "epoch": 0.25598506479244454, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.406829357147217, + "learning_rate": 8.528550512445095e-07, + "loss": 1.0347, + "mean_token_accuracy": 0.6965658068656921, + "num_tokens": 60025315.0, + "step": 2331 + }, + { + "epoch": 0.2560948824950582, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.289053440093994, + "learning_rate": 
8.53221083455344e-07, + "loss": 1.1063, + "mean_token_accuracy": 0.6862890720367432, + "num_tokens": 60057120.0, + "step": 2332 + }, + { + "epoch": 0.2562047001976719, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.2876055240631104, + "learning_rate": 8.535871156661786e-07, + "loss": 1.0403, + "mean_token_accuracy": 0.698716402053833, + "num_tokens": 60084156.0, + "step": 2333 + }, + { + "epoch": 0.25631451790028553, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.4269800186157227, + "learning_rate": 8.539531478770131e-07, + "loss": 1.0954, + "mean_token_accuracy": 0.6800997257232666, + "num_tokens": 60109948.0, + "step": 2334 + }, + { + "epoch": 0.2564243356028992, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.740713357925415, + "learning_rate": 8.543191800878476e-07, + "loss": 0.9536, + "mean_token_accuracy": 0.7084532976150513, + "num_tokens": 60129571.0, + "step": 2335 + }, + { + "epoch": 0.2565341533055128, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.336111307144165, + "learning_rate": 8.546852122986823e-07, + "loss": 1.0729, + "mean_token_accuracy": 0.682891845703125, + "num_tokens": 60157568.0, + "step": 2336 + }, + { + "epoch": 0.2566439710081265, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.386636257171631, + "learning_rate": 8.550512445095168e-07, + "loss": 1.0171, + "mean_token_accuracy": 0.6962590217590332, + "num_tokens": 60185034.0, + "step": 2337 + }, + { + "epoch": 0.25675378871074017, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.0782110691070557, + "learning_rate": 8.554172767203513e-07, + "loss": 1.046, + "mean_token_accuracy": 0.6917262673377991, + "num_tokens": 60218705.0, + "step": 2338 + }, + { + "epoch": 0.2568636064133538, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 2.302563428878784, + "learning_rate": 8.557833089311859e-07, + "loss": 0.9558, + "mean_token_accuracy": 0.7160061597824097, + "num_tokens": 60243522.0, + "step": 2339 + }, + { + "epoch": 0.2569734241159675, + "ewc_loss": 
6.407499313354492e-06, + "grad_norm": 2.2830898761749268, + "learning_rate": 8.561493411420204e-07, + "loss": 1.0259, + "mean_token_accuracy": 0.6988584399223328, + "num_tokens": 60271606.0, + "step": 2340 + }, + { + "epoch": 0.25708324181858117, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.2081589698791504, + "learning_rate": 8.56515373352855e-07, + "loss": 1.0522, + "mean_token_accuracy": 0.6916338801383972, + "num_tokens": 60300332.0, + "step": 2341 + }, + { + "epoch": 0.2571930595211948, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.2209863662719727, + "learning_rate": 8.568814055636896e-07, + "loss": 0.9143, + "mean_token_accuracy": 0.726434051990509, + "num_tokens": 60328449.0, + "step": 2342 + }, + { + "epoch": 0.25730287722380846, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.295189142227173, + "learning_rate": 8.572474377745241e-07, + "loss": 1.0294, + "mean_token_accuracy": 0.6984816789627075, + "num_tokens": 60354910.0, + "step": 2343 + }, + { + "epoch": 0.25741269492642216, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.374840021133423, + "learning_rate": 8.576134699853586e-07, + "loss": 1.0093, + "mean_token_accuracy": 0.7037403583526611, + "num_tokens": 60380063.0, + "step": 2344 + }, + { + "epoch": 0.2575225126290358, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.525238037109375, + "learning_rate": 8.579795021961932e-07, + "loss": 1.0647, + "mean_token_accuracy": 0.6863220930099487, + "num_tokens": 60404706.0, + "step": 2345 + }, + { + "epoch": 0.25763233033164945, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.2578556537628174, + "learning_rate": 8.583455344070277e-07, + "loss": 1.0142, + "mean_token_accuracy": 0.7007431387901306, + "num_tokens": 60433576.0, + "step": 2346 + }, + { + "epoch": 0.2577421480342631, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.2836499214172363, + "learning_rate": 8.587115666178624e-07, + "loss": 1.0187, + "mean_token_accuracy": 0.6965731978416443, + 
"num_tokens": 60458975.0, + "step": 2347 + }, + { + "epoch": 0.2578519657368768, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.1427628993988037, + "learning_rate": 8.590775988286969e-07, + "loss": 1.0341, + "mean_token_accuracy": 0.7038811445236206, + "num_tokens": 60490043.0, + "step": 2348 + }, + { + "epoch": 0.25796178343949044, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.4637773036956787, + "learning_rate": 8.594436310395314e-07, + "loss": 1.0529, + "mean_token_accuracy": 0.686892569065094, + "num_tokens": 60512267.0, + "step": 2349 + }, + { + "epoch": 0.2580716011421041, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.3705356121063232, + "learning_rate": 8.59809663250366e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.6893740892410278, + "num_tokens": 60537869.0, + "step": 2350 + }, + { + "epoch": 0.2581814188447178, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.351146697998047, + "learning_rate": 8.601756954612005e-07, + "loss": 1.0243, + "mean_token_accuracy": 0.6986782550811768, + "num_tokens": 60563603.0, + "step": 2351 + }, + { + "epoch": 0.25829123654733144, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.527076482772827, + "learning_rate": 8.605417276720351e-07, + "loss": 1.0303, + "mean_token_accuracy": 0.691522479057312, + "num_tokens": 60587192.0, + "step": 2352 + }, + { + "epoch": 0.2584010542499451, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.460662364959717, + "learning_rate": 8.609077598828697e-07, + "loss": 0.98, + "mean_token_accuracy": 0.7023266553878784, + "num_tokens": 60610638.0, + "step": 2353 + }, + { + "epoch": 0.25851087195255873, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 2.523030996322632, + "learning_rate": 8.612737920937042e-07, + "loss": 1.0868, + "mean_token_accuracy": 0.6877152919769287, + "num_tokens": 60636598.0, + "step": 2354 + }, + { + "epoch": 0.25862068965517243, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.182454824447632, + "learning_rate": 
8.616398243045387e-07, + "loss": 1.0775, + "mean_token_accuracy": 0.6855370402336121, + "num_tokens": 60666496.0, + "step": 2355 + }, + { + "epoch": 0.2587305073577861, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.152503490447998, + "learning_rate": 8.620058565153733e-07, + "loss": 1.0846, + "mean_token_accuracy": 0.6806139945983887, + "num_tokens": 60696995.0, + "step": 2356 + }, + { + "epoch": 0.2588403250603997, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.6216788291931152, + "learning_rate": 8.623718887262079e-07, + "loss": 0.9779, + "mean_token_accuracy": 0.7114546298980713, + "num_tokens": 60717345.0, + "step": 2357 + }, + { + "epoch": 0.2589501427630134, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.3943512439727783, + "learning_rate": 8.627379209370424e-07, + "loss": 1.0737, + "mean_token_accuracy": 0.6932008266448975, + "num_tokens": 60744639.0, + "step": 2358 + }, + { + "epoch": 0.25905996046562707, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.1788523197174072, + "learning_rate": 8.63103953147877e-07, + "loss": 1.1038, + "mean_token_accuracy": 0.6808695793151855, + "num_tokens": 60773675.0, + "step": 2359 + }, + { + "epoch": 0.2591697781682407, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.3226633071899414, + "learning_rate": 8.634699853587115e-07, + "loss": 1.0289, + "mean_token_accuracy": 0.6954525113105774, + "num_tokens": 60799408.0, + "step": 2360 + }, + { + "epoch": 0.25927959587085436, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.5846495628356934, + "learning_rate": 8.63836017569546e-07, + "loss": 1.0798, + "mean_token_accuracy": 0.6829979419708252, + "num_tokens": 60823031.0, + "step": 2361 + }, + { + "epoch": 0.25938941357346806, + "ewc_loss": 6.496906280517578e-06, + "grad_norm": 2.2970893383026123, + "learning_rate": 8.642020497803806e-07, + "loss": 1.0556, + "mean_token_accuracy": 0.6915290355682373, + "num_tokens": 60851807.0, + "step": 2362 + }, + { + "epoch": 0.2594992312760817, + 
"ewc_loss": 6.496906280517578e-06, + "grad_norm": 3.0987839698791504, + "learning_rate": 8.645680819912152e-07, + "loss": 1.0811, + "mean_token_accuracy": 0.6853134632110596, + "num_tokens": 60871805.0, + "step": 2363 + }, + { + "epoch": 0.25960904897869536, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.060086488723755, + "learning_rate": 8.649341142020498e-07, + "loss": 1.1778, + "mean_token_accuracy": 0.6555240750312805, + "num_tokens": 60909599.0, + "step": 2364 + }, + { + "epoch": 0.259718866681309, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.2843539714813232, + "learning_rate": 8.653001464128843e-07, + "loss": 1.0947, + "mean_token_accuracy": 0.6835613250732422, + "num_tokens": 60936870.0, + "step": 2365 + }, + { + "epoch": 0.2598286843839227, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.667731285095215, + "learning_rate": 8.656661786237188e-07, + "loss": 1.0264, + "mean_token_accuracy": 0.6964309215545654, + "num_tokens": 60958742.0, + "step": 2366 + }, + { + "epoch": 0.25993850208653635, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.3103911876678467, + "learning_rate": 8.660322108345534e-07, + "loss": 1.0474, + "mean_token_accuracy": 0.6947721242904663, + "num_tokens": 60987581.0, + "step": 2367 + }, + { + "epoch": 0.26004831978915, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.4760324954986572, + "learning_rate": 8.66398243045388e-07, + "loss": 1.0879, + "mean_token_accuracy": 0.6805105209350586, + "num_tokens": 61015388.0, + "step": 2368 + }, + { + "epoch": 0.2601581374917637, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.2143185138702393, + "learning_rate": 8.667642752562225e-07, + "loss": 1.0758, + "mean_token_accuracy": 0.6844462156295776, + "num_tokens": 61046490.0, + "step": 2369 + }, + { + "epoch": 0.26026795519437734, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.305933713912964, + "learning_rate": 8.671303074670571e-07, + "loss": 1.1251, + "mean_token_accuracy": 0.6715884208679199, + 
"num_tokens": 61076806.0, + "step": 2370 + }, + { + "epoch": 0.260377772896991, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.102733612060547, + "learning_rate": 8.674963396778916e-07, + "loss": 1.1276, + "mean_token_accuracy": 0.6760658025741577, + "num_tokens": 61109820.0, + "step": 2371 + }, + { + "epoch": 0.26048759059960463, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.4450645446777344, + "learning_rate": 8.678623718887261e-07, + "loss": 1.0093, + "mean_token_accuracy": 0.6975848078727722, + "num_tokens": 61131765.0, + "step": 2372 + }, + { + "epoch": 0.26059740830221834, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.330479621887207, + "learning_rate": 8.682284040995608e-07, + "loss": 1.0571, + "mean_token_accuracy": 0.6993956565856934, + "num_tokens": 61160748.0, + "step": 2373 + }, + { + "epoch": 0.260707226004832, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.350496530532837, + "learning_rate": 8.685944363103953e-07, + "loss": 1.0152, + "mean_token_accuracy": 0.7028970122337341, + "num_tokens": 61189525.0, + "step": 2374 + }, + { + "epoch": 0.26081704370744563, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.3962554931640625, + "learning_rate": 8.689604685212298e-07, + "loss": 1.0533, + "mean_token_accuracy": 0.6933459043502808, + "num_tokens": 61217544.0, + "step": 2375 + }, + { + "epoch": 0.2609268614100593, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.308616876602173, + "learning_rate": 8.693265007320644e-07, + "loss": 1.0323, + "mean_token_accuracy": 0.6931240558624268, + "num_tokens": 61244972.0, + "step": 2376 + }, + { + "epoch": 0.261036679112673, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.2994208335876465, + "learning_rate": 8.696925329428989e-07, + "loss": 1.1012, + "mean_token_accuracy": 0.6709691286087036, + "num_tokens": 61272586.0, + "step": 2377 + }, + { + "epoch": 0.2611464968152866, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.5104546546936035, + "learning_rate": 
8.700585651537334e-07, + "loss": 0.9089, + "mean_token_accuracy": 0.7229135036468506, + "num_tokens": 61294173.0, + "step": 2378 + }, + { + "epoch": 0.26125631451790027, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.2898426055908203, + "learning_rate": 8.704245973645681e-07, + "loss": 1.0594, + "mean_token_accuracy": 0.6895331740379333, + "num_tokens": 61321813.0, + "step": 2379 + }, + { + "epoch": 0.26136613222051397, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 2.310698986053467, + "learning_rate": 8.707906295754026e-07, + "loss": 1.1765, + "mean_token_accuracy": 0.6570535898208618, + "num_tokens": 61351003.0, + "step": 2380 + }, + { + "epoch": 0.2614759499231276, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.46167254447937, + "learning_rate": 8.711566617862372e-07, + "loss": 1.0979, + "mean_token_accuracy": 0.6823468208312988, + "num_tokens": 61376842.0, + "step": 2381 + }, + { + "epoch": 0.26158576762574126, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1748740673065186, + "learning_rate": 8.715226939970717e-07, + "loss": 1.037, + "mean_token_accuracy": 0.6944220066070557, + "num_tokens": 61409980.0, + "step": 2382 + }, + { + "epoch": 0.2616955853283549, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1403770446777344, + "learning_rate": 8.718887262079062e-07, + "loss": 1.0797, + "mean_token_accuracy": 0.6876910328865051, + "num_tokens": 61442720.0, + "step": 2383 + }, + { + "epoch": 0.2618054030309686, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.431570291519165, + "learning_rate": 8.722547584187409e-07, + "loss": 1.0136, + "mean_token_accuracy": 0.7009159326553345, + "num_tokens": 61468322.0, + "step": 2384 + }, + { + "epoch": 0.26191522073358225, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.6035659313201904, + "learning_rate": 8.726207906295754e-07, + "loss": 1.0425, + "mean_token_accuracy": 0.6847246289253235, + "num_tokens": 61490210.0, + "step": 2385 + }, + { + "epoch": 0.2620250384361959, + 
"ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.5315566062927246, + "learning_rate": 8.729868228404099e-07, + "loss": 1.0408, + "mean_token_accuracy": 0.6881383061408997, + "num_tokens": 61513905.0, + "step": 2386 + }, + { + "epoch": 0.2621348561388096, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.6673638820648193, + "learning_rate": 8.733528550512445e-07, + "loss": 1.0302, + "mean_token_accuracy": 0.6901518106460571, + "num_tokens": 61533984.0, + "step": 2387 + }, + { + "epoch": 0.26224467384142325, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.370582103729248, + "learning_rate": 8.73718887262079e-07, + "loss": 0.9955, + "mean_token_accuracy": 0.7070118188858032, + "num_tokens": 61559133.0, + "step": 2388 + }, + { + "epoch": 0.2623544915440369, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.4242594242095947, + "learning_rate": 8.740849194729136e-07, + "loss": 1.0153, + "mean_token_accuracy": 0.7000166177749634, + "num_tokens": 61583791.0, + "step": 2389 + }, + { + "epoch": 0.26246430924665054, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.6796491146087646, + "learning_rate": 8.744509516837482e-07, + "loss": 0.9945, + "mean_token_accuracy": 0.7060163021087646, + "num_tokens": 61604249.0, + "step": 2390 + }, + { + "epoch": 0.26257412694926424, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.5882034301757812, + "learning_rate": 8.748169838945827e-07, + "loss": 1.0024, + "mean_token_accuracy": 0.7055046558380127, + "num_tokens": 61627052.0, + "step": 2391 + }, + { + "epoch": 0.2626839446518779, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.220257520675659, + "learning_rate": 8.751830161054172e-07, + "loss": 1.1033, + "mean_token_accuracy": 0.682323157787323, + "num_tokens": 61655979.0, + "step": 2392 + }, + { + "epoch": 0.26279376235449153, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.5344178676605225, + "learning_rate": 8.755490483162518e-07, + "loss": 1.1216, + "mean_token_accuracy": 0.6674572825431824, + 
"num_tokens": 61684326.0, + "step": 2393 + }, + { + "epoch": 0.2629035800571052, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.4113662242889404, + "learning_rate": 8.759150805270863e-07, + "loss": 1.0959, + "mean_token_accuracy": 0.6780531406402588, + "num_tokens": 61712239.0, + "step": 2394 + }, + { + "epoch": 0.2630133977597189, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.3816120624542236, + "learning_rate": 8.76281112737921e-07, + "loss": 1.0596, + "mean_token_accuracy": 0.6892017126083374, + "num_tokens": 61740619.0, + "step": 2395 + }, + { + "epoch": 0.2631232154623325, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.396796703338623, + "learning_rate": 8.766471449487555e-07, + "loss": 1.0788, + "mean_token_accuracy": 0.6795458793640137, + "num_tokens": 61766457.0, + "step": 2396 + }, + { + "epoch": 0.2632330331649462, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.4022624492645264, + "learning_rate": 8.7701317715959e-07, + "loss": 1.0479, + "mean_token_accuracy": 0.6948560476303101, + "num_tokens": 61792109.0, + "step": 2397 + }, + { + "epoch": 0.2633428508675599, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.196361541748047, + "learning_rate": 8.773792093704246e-07, + "loss": 1.0226, + "mean_token_accuracy": 0.6959666013717651, + "num_tokens": 61821867.0, + "step": 2398 + }, + { + "epoch": 0.2634526685701735, + "ewc_loss": 6.556510925292969e-06, + "grad_norm": 2.1574223041534424, + "learning_rate": 8.777452415812591e-07, + "loss": 0.9984, + "mean_token_accuracy": 0.7028728127479553, + "num_tokens": 61851076.0, + "step": 2399 + }, + { + "epoch": 0.26356248627278717, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 3.7463557720184326, + "learning_rate": 8.781112737920937e-07, + "loss": 0.9976, + "mean_token_accuracy": 0.698840320110321, + "num_tokens": 61883955.0, + "step": 2400 + }, + { + "epoch": 0.2636723039754008, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.2889585494995117, + "learning_rate": 
8.784773060029283e-07, + "loss": 0.9417, + "mean_token_accuracy": 0.7137198448181152, + "num_tokens": 61909733.0, + "step": 2401 + }, + { + "epoch": 0.2637821216780145, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.3286452293395996, + "learning_rate": 8.788433382137628e-07, + "loss": 1.0458, + "mean_token_accuracy": 0.6884894371032715, + "num_tokens": 61937771.0, + "step": 2402 + }, + { + "epoch": 0.26389193938062816, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.7334210872650146, + "learning_rate": 8.792093704245973e-07, + "loss": 1.0176, + "mean_token_accuracy": 0.6973518133163452, + "num_tokens": 61958207.0, + "step": 2403 + }, + { + "epoch": 0.2640017570832418, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.255061626434326, + "learning_rate": 8.795754026354319e-07, + "loss": 1.0686, + "mean_token_accuracy": 0.6797558069229126, + "num_tokens": 61987806.0, + "step": 2404 + }, + { + "epoch": 0.2641115747858555, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.181140422821045, + "learning_rate": 8.799414348462665e-07, + "loss": 1.1309, + "mean_token_accuracy": 0.6696490049362183, + "num_tokens": 62019461.0, + "step": 2405 + }, + { + "epoch": 0.26422139248846915, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.1722891330718994, + "learning_rate": 8.80307467057101e-07, + "loss": 1.1271, + "mean_token_accuracy": 0.6730019450187683, + "num_tokens": 62052648.0, + "step": 2406 + }, + { + "epoch": 0.2643312101910828, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.4780147075653076, + "learning_rate": 8.806734992679356e-07, + "loss": 1.0374, + "mean_token_accuracy": 0.6987388134002686, + "num_tokens": 62077698.0, + "step": 2407 + }, + { + "epoch": 0.26444102789369645, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.324237108230591, + "learning_rate": 8.810395314787701e-07, + "loss": 1.0664, + "mean_token_accuracy": 0.6955321431159973, + "num_tokens": 62105774.0, + "step": 2408 + }, + { + "epoch": 0.26455084559631015, + 
"ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.589890956878662, + "learning_rate": 8.814055636896046e-07, + "loss": 1.0443, + "mean_token_accuracy": 0.6905312538146973, + "num_tokens": 62128899.0, + "step": 2409 + }, + { + "epoch": 0.2646606632989238, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.533609628677368, + "learning_rate": 8.817715959004392e-07, + "loss": 1.0864, + "mean_token_accuracy": 0.6761497259140015, + "num_tokens": 62152471.0, + "step": 2410 + }, + { + "epoch": 0.26477048100153744, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.730226516723633, + "learning_rate": 8.821376281112738e-07, + "loss": 1.0578, + "mean_token_accuracy": 0.6907006502151489, + "num_tokens": 62174310.0, + "step": 2411 + }, + { + "epoch": 0.2648802987041511, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 2.757680892944336, + "learning_rate": 8.825036603221084e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.7063805460929871, + "num_tokens": 62193956.0, + "step": 2412 + }, + { + "epoch": 0.2649901164067648, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 3.822922468185425, + "learning_rate": 8.828696925329429e-07, + "loss": 1.0707, + "mean_token_accuracy": 0.6787898540496826, + "num_tokens": 62220723.0, + "step": 2413 + }, + { + "epoch": 0.26509993410937843, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.434927463531494, + "learning_rate": 8.832357247437774e-07, + "loss": 1.0634, + "mean_token_accuracy": 0.6878933906555176, + "num_tokens": 62246768.0, + "step": 2414 + }, + { + "epoch": 0.2652097518119921, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.290491819381714, + "learning_rate": 8.83601756954612e-07, + "loss": 0.9956, + "mean_token_accuracy": 0.7066907286643982, + "num_tokens": 62274476.0, + "step": 2415 + }, + { + "epoch": 0.2653195695146058, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.2396819591522217, + "learning_rate": 8.839677891654466e-07, + "loss": 0.9903, + "mean_token_accuracy": 0.7071049213409424, + 
"num_tokens": 62300250.0, + "step": 2416 + }, + { + "epoch": 0.2654293872172194, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.3078255653381348, + "learning_rate": 8.843338213762811e-07, + "loss": 0.9932, + "mean_token_accuracy": 0.7101093530654907, + "num_tokens": 62325482.0, + "step": 2417 + }, + { + "epoch": 0.26553920491983307, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.611631393432617, + "learning_rate": 8.846998535871157e-07, + "loss": 1.0518, + "mean_token_accuracy": 0.694212794303894, + "num_tokens": 62348385.0, + "step": 2418 + }, + { + "epoch": 0.2656490226224467, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.4566421508789062, + "learning_rate": 8.850658857979502e-07, + "loss": 1.0125, + "mean_token_accuracy": 0.6997066736221313, + "num_tokens": 62371942.0, + "step": 2419 + }, + { + "epoch": 0.2657588403250604, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.455685615539551, + "learning_rate": 8.854319180087847e-07, + "loss": 1.0042, + "mean_token_accuracy": 0.7039054036140442, + "num_tokens": 62398163.0, + "step": 2420 + }, + { + "epoch": 0.26586865802767407, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.856051445007324, + "learning_rate": 8.857979502196194e-07, + "loss": 1.0467, + "mean_token_accuracy": 0.6844320297241211, + "num_tokens": 62419047.0, + "step": 2421 + }, + { + "epoch": 0.2659784757302877, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.2977447509765625, + "learning_rate": 8.861639824304539e-07, + "loss": 1.1324, + "mean_token_accuracy": 0.6802611351013184, + "num_tokens": 62446956.0, + "step": 2422 + }, + { + "epoch": 0.26608829343290136, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.5531885623931885, + "learning_rate": 8.865300146412884e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.689060389995575, + "num_tokens": 62468806.0, + "step": 2423 + }, + { + "epoch": 0.26619811113551506, + "ewc_loss": 6.616115570068359e-06, + "grad_norm": 2.4148788452148438, + "learning_rate": 
8.86896046852123e-07, + "loss": 1.073, + "mean_token_accuracy": 0.6867728233337402, + "num_tokens": 62493696.0, + "step": 2424 + }, + { + "epoch": 0.2663079288381287, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 2.9965286254882812, + "learning_rate": 8.872620790629575e-07, + "loss": 0.9425, + "mean_token_accuracy": 0.7137151956558228, + "num_tokens": 62510545.0, + "step": 2425 + }, + { + "epoch": 0.26641774654074235, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 2.2461133003234863, + "learning_rate": 8.87628111273792e-07, + "loss": 1.0294, + "mean_token_accuracy": 0.7011202573776245, + "num_tokens": 62538602.0, + "step": 2426 + }, + { + "epoch": 0.26652756424335605, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 2.958650827407837, + "learning_rate": 8.879941434846267e-07, + "loss": 0.9416, + "mean_token_accuracy": 0.7152718901634216, + "num_tokens": 62556858.0, + "step": 2427 + }, + { + "epoch": 0.2666373819459697, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 2.436318874359131, + "learning_rate": 8.883601756954612e-07, + "loss": 1.0217, + "mean_token_accuracy": 0.7003315091133118, + "num_tokens": 62580368.0, + "step": 2428 + }, + { + "epoch": 0.26674719964858334, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.3555455207824707, + "learning_rate": 8.887262079062958e-07, + "loss": 1.0318, + "mean_token_accuracy": 0.6960349082946777, + "num_tokens": 62605952.0, + "step": 2429 + }, + { + "epoch": 0.266857017351197, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.222398519515991, + "learning_rate": 8.890922401171303e-07, + "loss": 1.0526, + "mean_token_accuracy": 0.6864455938339233, + "num_tokens": 62635903.0, + "step": 2430 + }, + { + "epoch": 0.2669668350538107, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.068176746368408, + "learning_rate": 8.894582723279648e-07, + "loss": 1.1256, + "mean_token_accuracy": 0.6675643920898438, + "num_tokens": 62675543.0, + "step": 2431 + }, + { + "epoch": 0.26707665275642434, + "ewc_loss": 
6.67572021484375e-06, + "grad_norm": 2.2227649688720703, + "learning_rate": 8.898243045387995e-07, + "loss": 1.0965, + "mean_token_accuracy": 0.6745836734771729, + "num_tokens": 62705144.0, + "step": 2432 + }, + { + "epoch": 0.267186470459038, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.4475626945495605, + "learning_rate": 8.90190336749634e-07, + "loss": 1.0237, + "mean_token_accuracy": 0.6931136250495911, + "num_tokens": 62730740.0, + "step": 2433 + }, + { + "epoch": 0.2672962881616517, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.4183828830718994, + "learning_rate": 8.905563689604685e-07, + "loss": 1.08, + "mean_token_accuracy": 0.68328857421875, + "num_tokens": 62755879.0, + "step": 2434 + }, + { + "epoch": 0.26740610586426533, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.1845412254333496, + "learning_rate": 8.909224011713031e-07, + "loss": 1.0131, + "mean_token_accuracy": 0.6999561786651611, + "num_tokens": 62786905.0, + "step": 2435 + }, + { + "epoch": 0.267515923566879, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.187045097351074, + "learning_rate": 8.912884333821376e-07, + "loss": 1.0584, + "mean_token_accuracy": 0.6905941367149353, + "num_tokens": 62815251.0, + "step": 2436 + }, + { + "epoch": 0.2676257412694926, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.406048536300659, + "learning_rate": 8.916544655929722e-07, + "loss": 0.997, + "mean_token_accuracy": 0.711561918258667, + "num_tokens": 62840071.0, + "step": 2437 + }, + { + "epoch": 0.2677355589721063, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 2.4614083766937256, + "learning_rate": 8.920204978038068e-07, + "loss": 0.9614, + "mean_token_accuracy": 0.7103568911552429, + "num_tokens": 62863599.0, + "step": 2438 + }, + { + "epoch": 0.26784537667471997, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.4654054641723633, + "learning_rate": 8.923865300146413e-07, + "loss": 1.0063, + "mean_token_accuracy": 0.6999257206916809, + "num_tokens": 62886063.0, + 
"step": 2439 + }, + { + "epoch": 0.2679551943773336, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.3953335285186768, + "learning_rate": 8.927525622254757e-07, + "loss": 1.031, + "mean_token_accuracy": 0.7028942108154297, + "num_tokens": 62911701.0, + "step": 2440 + }, + { + "epoch": 0.26806501207994726, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.7024078369140625, + "learning_rate": 8.931185944363104e-07, + "loss": 0.9811, + "mean_token_accuracy": 0.7067464590072632, + "num_tokens": 62931389.0, + "step": 2441 + }, + { + "epoch": 0.26817482978256096, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.385138750076294, + "learning_rate": 8.934846266471448e-07, + "loss": 1.1099, + "mean_token_accuracy": 0.6738148331642151, + "num_tokens": 62959298.0, + "step": 2442 + }, + { + "epoch": 0.2682846474851746, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.2320938110351562, + "learning_rate": 8.938506588579794e-07, + "loss": 1.0102, + "mean_token_accuracy": 0.706573486328125, + "num_tokens": 62987228.0, + "step": 2443 + }, + { + "epoch": 0.26839446518778826, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.1791772842407227, + "learning_rate": 8.94216691068814e-07, + "loss": 1.0177, + "mean_token_accuracy": 0.6936444044113159, + "num_tokens": 63016848.0, + "step": 2444 + }, + { + "epoch": 0.26850428289040196, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.2937746047973633, + "learning_rate": 8.945827232796485e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.6751808524131775, + "num_tokens": 63047314.0, + "step": 2445 + }, + { + "epoch": 0.2686141005930156, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.1565587520599365, + "learning_rate": 8.949487554904832e-07, + "loss": 1.1239, + "mean_token_accuracy": 0.671558141708374, + "num_tokens": 63081054.0, + "step": 2446 + }, + { + "epoch": 0.26872391829562925, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.5526511669158936, + "learning_rate": 8.953147877013176e-07, + "loss": 
1.0261, + "mean_token_accuracy": 0.6961714625358582, + "num_tokens": 63102763.0, + "step": 2447 + }, + { + "epoch": 0.2688337359982429, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.6343889236450195, + "learning_rate": 8.956808199121522e-07, + "loss": 1.0328, + "mean_token_accuracy": 0.6934819221496582, + "num_tokens": 63124068.0, + "step": 2448 + }, + { + "epoch": 0.2689435537008566, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 2.458786725997925, + "learning_rate": 8.960468521229869e-07, + "loss": 0.9825, + "mean_token_accuracy": 0.7071743011474609, + "num_tokens": 63145050.0, + "step": 2449 + }, + { + "epoch": 0.26905337140347024, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.5499465465545654, + "learning_rate": 8.964128843338213e-07, + "loss": 0.9927, + "mean_token_accuracy": 0.712051272392273, + "num_tokens": 63167810.0, + "step": 2450 + }, + { + "epoch": 0.2691631891060839, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.296452522277832, + "learning_rate": 8.967789165446558e-07, + "loss": 1.0035, + "mean_token_accuracy": 0.7036582231521606, + "num_tokens": 63197274.0, + "step": 2451 + }, + { + "epoch": 0.26927300680869753, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.4144845008850098, + "learning_rate": 8.971449487554904e-07, + "loss": 1.0564, + "mean_token_accuracy": 0.685832142829895, + "num_tokens": 63221700.0, + "step": 2452 + }, + { + "epoch": 0.26938282451131124, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.4104197025299072, + "learning_rate": 8.97510980966325e-07, + "loss": 1.0778, + "mean_token_accuracy": 0.6801242828369141, + "num_tokens": 63248117.0, + "step": 2453 + }, + { + "epoch": 0.2694926422139249, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.5081586837768555, + "learning_rate": 8.978770131771595e-07, + "loss": 0.9432, + "mean_token_accuracy": 0.7159920930862427, + "num_tokens": 63271257.0, + "step": 2454 + }, + { + "epoch": 0.26960245991653853, + "ewc_loss": 6.735324859619141e-06, + 
"grad_norm": 2.5962109565734863, + "learning_rate": 8.982430453879941e-07, + "loss": 1.0399, + "mean_token_accuracy": 0.6880397796630859, + "num_tokens": 63293129.0, + "step": 2455 + }, + { + "epoch": 0.26971227761915223, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.678495407104492, + "learning_rate": 8.986090775988286e-07, + "loss": 1.0239, + "mean_token_accuracy": 0.6971922516822815, + "num_tokens": 63315767.0, + "step": 2456 + }, + { + "epoch": 0.2698220953217659, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.2797279357910156, + "learning_rate": 8.989751098096631e-07, + "loss": 1.0459, + "mean_token_accuracy": 0.6950170397758484, + "num_tokens": 63345045.0, + "step": 2457 + }, + { + "epoch": 0.2699319130243795, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 2.383366584777832, + "learning_rate": 8.993411420204977e-07, + "loss": 1.0137, + "mean_token_accuracy": 0.6963866353034973, + "num_tokens": 63370935.0, + "step": 2458 + }, + { + "epoch": 0.27004173072699317, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.5126936435699463, + "learning_rate": 8.997071742313323e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.687142014503479, + "num_tokens": 63395023.0, + "step": 2459 + }, + { + "epoch": 0.27015154842960687, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.1168506145477295, + "learning_rate": 9.000732064421668e-07, + "loss": 1.1185, + "mean_token_accuracy": 0.6731691956520081, + "num_tokens": 63429774.0, + "step": 2460 + }, + { + "epoch": 0.2702613661322205, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.3037984371185303, + "learning_rate": 9.004392386530014e-07, + "loss": 0.9755, + "mean_token_accuracy": 0.7061091661453247, + "num_tokens": 63454711.0, + "step": 2461 + }, + { + "epoch": 0.27037118383483416, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.3687233924865723, + "learning_rate": 9.008052708638359e-07, + "loss": 1.0049, + "mean_token_accuracy": 0.6978473663330078, + "num_tokens": 63481256.0, + "step": 
2462 + }, + { + "epoch": 0.27048100153744786, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 2.630368947982788, + "learning_rate": 9.011713030746705e-07, + "loss": 1.0748, + "mean_token_accuracy": 0.6836726665496826, + "num_tokens": 63507253.0, + "step": 2463 + }, + { + "epoch": 0.2705908192400615, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.4259908199310303, + "learning_rate": 9.015373352855051e-07, + "loss": 1.0211, + "mean_token_accuracy": 0.6869105100631714, + "num_tokens": 63532003.0, + "step": 2464 + }, + { + "epoch": 0.27070063694267515, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.5598385334014893, + "learning_rate": 9.019033674963396e-07, + "loss": 1.0098, + "mean_token_accuracy": 0.6972121000289917, + "num_tokens": 63554003.0, + "step": 2465 + }, + { + "epoch": 0.2708104546452888, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.6319386959075928, + "learning_rate": 9.022693997071742e-07, + "loss": 0.9032, + "mean_token_accuracy": 0.7256460785865784, + "num_tokens": 63574058.0, + "step": 2466 + }, + { + "epoch": 0.2709202723479025, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.654247999191284, + "learning_rate": 9.026354319180087e-07, + "loss": 0.9813, + "mean_token_accuracy": 0.7046439051628113, + "num_tokens": 63596234.0, + "step": 2467 + }, + { + "epoch": 0.27103009005051615, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.162102699279785, + "learning_rate": 9.030014641288432e-07, + "loss": 1.0753, + "mean_token_accuracy": 0.6845399141311646, + "num_tokens": 63627837.0, + "step": 2468 + }, + { + "epoch": 0.2711399077531298, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.1661159992218018, + "learning_rate": 9.033674963396779e-07, + "loss": 1.1418, + "mean_token_accuracy": 0.6668282747268677, + "num_tokens": 63661751.0, + "step": 2469 + }, + { + "epoch": 0.27124972545574344, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.521420478820801, + "learning_rate": 9.037335285505124e-07, + "loss": 1.0612, + 
"mean_token_accuracy": 0.6901969909667969, + "num_tokens": 63686230.0, + "step": 2470 + }, + { + "epoch": 0.27135954315835714, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.5200910568237305, + "learning_rate": 9.040995607613469e-07, + "loss": 1.1121, + "mean_token_accuracy": 0.6719202995300293, + "num_tokens": 63710974.0, + "step": 2471 + }, + { + "epoch": 0.2714693608609708, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.250256299972534, + "learning_rate": 9.044655929721815e-07, + "loss": 1.1035, + "mean_token_accuracy": 0.6745588183403015, + "num_tokens": 63741057.0, + "step": 2472 + }, + { + "epoch": 0.27157917856358443, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.443284749984741, + "learning_rate": 9.04831625183016e-07, + "loss": 1.1046, + "mean_token_accuracy": 0.6766387224197388, + "num_tokens": 63766625.0, + "step": 2473 + }, + { + "epoch": 0.27168899626619814, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 2.2501914501190186, + "learning_rate": 9.051976573938505e-07, + "loss": 1.094, + "mean_token_accuracy": 0.6804614067077637, + "num_tokens": 63792679.0, + "step": 2474 + }, + { + "epoch": 0.2717988139688118, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.3570282459259033, + "learning_rate": 9.055636896046852e-07, + "loss": 1.0541, + "mean_token_accuracy": 0.6894978880882263, + "num_tokens": 63818259.0, + "step": 2475 + }, + { + "epoch": 0.2719086316714254, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.554440975189209, + "learning_rate": 9.059297218155197e-07, + "loss": 0.9895, + "mean_token_accuracy": 0.7014214396476746, + "num_tokens": 63839630.0, + "step": 2476 + }, + { + "epoch": 0.2720184493740391, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.1653757095336914, + "learning_rate": 9.062957540263542e-07, + "loss": 0.9992, + "mean_token_accuracy": 0.7014999389648438, + "num_tokens": 63868019.0, + "step": 2477 + }, + { + "epoch": 0.2721282670766528, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 
2.1259546279907227, + "learning_rate": 9.066617862371888e-07, + "loss": 1.0348, + "mean_token_accuracy": 0.7005065679550171, + "num_tokens": 63896304.0, + "step": 2478 + }, + { + "epoch": 0.2722380847792664, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.291616439819336, + "learning_rate": 9.070278184480233e-07, + "loss": 0.9667, + "mean_token_accuracy": 0.706994891166687, + "num_tokens": 63925436.0, + "step": 2479 + }, + { + "epoch": 0.27234790248188007, + "ewc_loss": 6.854534149169922e-06, + "grad_norm": 2.412344217300415, + "learning_rate": 9.07393850658858e-07, + "loss": 1.0169, + "mean_token_accuracy": 0.6980688571929932, + "num_tokens": 63949487.0, + "step": 2480 + }, + { + "epoch": 0.27245772018449377, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.3831236362457275, + "learning_rate": 9.077598828696925e-07, + "loss": 0.9452, + "mean_token_accuracy": 0.7218904495239258, + "num_tokens": 63972779.0, + "step": 2481 + }, + { + "epoch": 0.2725675378871074, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 3.010152578353882, + "learning_rate": 9.08125915080527e-07, + "loss": 1.0109, + "mean_token_accuracy": 0.699666440486908, + "num_tokens": 63990229.0, + "step": 2482 + }, + { + "epoch": 0.27267735558972106, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 2.5293025970458984, + "learning_rate": 9.084919472913616e-07, + "loss": 1.1056, + "mean_token_accuracy": 0.6723036766052246, + "num_tokens": 64014964.0, + "step": 2483 + }, + { + "epoch": 0.2727871732923347, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.308962106704712, + "learning_rate": 9.088579795021961e-07, + "loss": 1.0115, + "mean_token_accuracy": 0.6975375413894653, + "num_tokens": 64041383.0, + "step": 2484 + }, + { + "epoch": 0.2728969909949484, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.3950629234313965, + "learning_rate": 9.092240117130307e-07, + "loss": 1.0767, + "mean_token_accuracy": 0.6798577904701233, + "num_tokens": 64070705.0, + "step": 2485 + }, + { + 
"epoch": 0.27300680869756205, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.2825231552124023, + "learning_rate": 9.095900439238653e-07, + "loss": 0.9393, + "mean_token_accuracy": 0.7134646773338318, + "num_tokens": 64098527.0, + "step": 2486 + }, + { + "epoch": 0.2731166264001757, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 6.952721118927002, + "learning_rate": 9.099560761346998e-07, + "loss": 0.9924, + "mean_token_accuracy": 0.6995919346809387, + "num_tokens": 64129915.0, + "step": 2487 + }, + { + "epoch": 0.27322644410278935, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.564575433731079, + "learning_rate": 9.103221083455343e-07, + "loss": 1.0494, + "mean_token_accuracy": 0.6896824836730957, + "num_tokens": 64154983.0, + "step": 2488 + }, + { + "epoch": 0.27333626180540305, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.2503697872161865, + "learning_rate": 9.106881405563689e-07, + "loss": 1.1596, + "mean_token_accuracy": 0.6725138425827026, + "num_tokens": 64186278.0, + "step": 2489 + }, + { + "epoch": 0.2734460795080167, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.4422383308410645, + "learning_rate": 9.110541727672034e-07, + "loss": 1.0587, + "mean_token_accuracy": 0.6881017088890076, + "num_tokens": 64211406.0, + "step": 2490 + }, + { + "epoch": 0.27355589721063034, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.4347951412200928, + "learning_rate": 9.11420204978038e-07, + "loss": 0.9708, + "mean_token_accuracy": 0.7094969749450684, + "num_tokens": 64236208.0, + "step": 2491 + }, + { + "epoch": 0.27366571491324404, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.172428607940674, + "learning_rate": 9.117862371888726e-07, + "loss": 1.1387, + "mean_token_accuracy": 0.6666780710220337, + "num_tokens": 64269029.0, + "step": 2492 + }, + { + "epoch": 0.2737755326158577, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.4816246032714844, + "learning_rate": 9.121522693997071e-07, + "loss": 1.1021, + 
"mean_token_accuracy": 0.6697753071784973, + "num_tokens": 64295333.0, + "step": 2493 + }, + { + "epoch": 0.27388535031847133, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.2481465339660645, + "learning_rate": 9.125183016105416e-07, + "loss": 1.0604, + "mean_token_accuracy": 0.6842597723007202, + "num_tokens": 64325395.0, + "step": 2494 + }, + { + "epoch": 0.273995168021085, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.2947916984558105, + "learning_rate": 9.128843338213762e-07, + "loss": 1.0495, + "mean_token_accuracy": 0.6860611438751221, + "num_tokens": 64353858.0, + "step": 2495 + }, + { + "epoch": 0.2741049857236987, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.439772844314575, + "learning_rate": 9.132503660322108e-07, + "loss": 1.0731, + "mean_token_accuracy": 0.681024432182312, + "num_tokens": 64378320.0, + "step": 2496 + }, + { + "epoch": 0.2742148034263123, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 2.470395088195801, + "learning_rate": 9.136163982430454e-07, + "loss": 0.9755, + "mean_token_accuracy": 0.7098333835601807, + "num_tokens": 64402520.0, + "step": 2497 + }, + { + "epoch": 0.27432462112892597, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.4302453994750977, + "learning_rate": 9.139824304538799e-07, + "loss": 0.927, + "mean_token_accuracy": 0.7212991714477539, + "num_tokens": 64424227.0, + "step": 2498 + }, + { + "epoch": 0.2744344388315396, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.519434928894043, + "learning_rate": 9.143484626647144e-07, + "loss": 1.1332, + "mean_token_accuracy": 0.6734990477561951, + "num_tokens": 64449016.0, + "step": 2499 + }, + { + "epoch": 0.2745442565341533, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.5217087268829346, + "learning_rate": 9.14714494875549e-07, + "loss": 0.8778, + "mean_token_accuracy": 0.7297985553741455, + "num_tokens": 64471600.0, + "step": 2500 + }, + { + "epoch": 0.27465407423676697, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 
2.2653250694274902, + "learning_rate": 9.150805270863836e-07, + "loss": 1.112, + "mean_token_accuracy": 0.6788760423660278, + "num_tokens": 64501456.0, + "step": 2501 + }, + { + "epoch": 0.2747638919393806, + "ewc_loss": 7.0035457611083984e-06, + "grad_norm": 2.594970464706421, + "learning_rate": 9.154465592972181e-07, + "loss": 1.0284, + "mean_token_accuracy": 0.6992946267127991, + "num_tokens": 64525896.0, + "step": 2502 + }, + { + "epoch": 0.2748737096419943, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.1866438388824463, + "learning_rate": 9.158125915080527e-07, + "loss": 1.0824, + "mean_token_accuracy": 0.6852067112922668, + "num_tokens": 64556903.0, + "step": 2503 + }, + { + "epoch": 0.27498352734460796, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.2127559185028076, + "learning_rate": 9.161786237188872e-07, + "loss": 1.0763, + "mean_token_accuracy": 0.6797981262207031, + "num_tokens": 64586040.0, + "step": 2504 + }, + { + "epoch": 0.2750933450472216, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.3696234226226807, + "learning_rate": 9.165446559297217e-07, + "loss": 1.0962, + "mean_token_accuracy": 0.6803340911865234, + "num_tokens": 64611675.0, + "step": 2505 + }, + { + "epoch": 0.27520316274983525, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.363492250442505, + "learning_rate": 9.169106881405563e-07, + "loss": 0.963, + "mean_token_accuracy": 0.7151016592979431, + "num_tokens": 64637574.0, + "step": 2506 + }, + { + "epoch": 0.27531298045244895, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.5438270568847656, + "learning_rate": 9.172767203513909e-07, + "loss": 0.9052, + "mean_token_accuracy": 0.7251468896865845, + "num_tokens": 64659812.0, + "step": 2507 + }, + { + "epoch": 0.2754227981550626, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.590874433517456, + "learning_rate": 9.176427525622254e-07, + "loss": 1.1085, + "mean_token_accuracy": 0.6769031286239624, + "num_tokens": 64683019.0, + "step": 2508 + }, + { + 
"epoch": 0.27553261585767624, + "ewc_loss": 6.943941116333008e-06, + "grad_norm": 2.43335223197937, + "learning_rate": 9.1800878477306e-07, + "loss": 1.0494, + "mean_token_accuracy": 0.695327639579773, + "num_tokens": 64708468.0, + "step": 2509 + }, + { + "epoch": 0.27564243356028995, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.135103464126587, + "learning_rate": 9.183748169838945e-07, + "loss": 1.0685, + "mean_token_accuracy": 0.6789318323135376, + "num_tokens": 64740161.0, + "step": 2510 + }, + { + "epoch": 0.2757522512629036, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.6498818397521973, + "learning_rate": 9.18740849194729e-07, + "loss": 0.9846, + "mean_token_accuracy": 0.7058448791503906, + "num_tokens": 64759273.0, + "step": 2511 + }, + { + "epoch": 0.27586206896551724, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.1753485202789307, + "learning_rate": 9.191068814055637e-07, + "loss": 1.0115, + "mean_token_accuracy": 0.6983669996261597, + "num_tokens": 64788926.0, + "step": 2512 + }, + { + "epoch": 0.2759718866681309, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.326906204223633, + "learning_rate": 9.194729136163982e-07, + "loss": 1.068, + "mean_token_accuracy": 0.6872776746749878, + "num_tokens": 64817940.0, + "step": 2513 + }, + { + "epoch": 0.2760817043707446, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.0507166385650635, + "learning_rate": 9.198389458272328e-07, + "loss": 1.1512, + "mean_token_accuracy": 0.6780908107757568, + "num_tokens": 64855373.0, + "step": 2514 + }, + { + "epoch": 0.27619152207335823, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.3374454975128174, + "learning_rate": 9.202049780380673e-07, + "loss": 1.0237, + "mean_token_accuracy": 0.6913691759109497, + "num_tokens": 64881957.0, + "step": 2515 + }, + { + "epoch": 0.2763013397759719, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.263355016708374, + "learning_rate": 9.205710102489018e-07, + "loss": 0.9732, + "mean_token_accuracy": 
0.7068639397621155, + "num_tokens": 64908585.0, + "step": 2516 + }, + { + "epoch": 0.2764111574785855, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.620222568511963, + "learning_rate": 9.209370424597365e-07, + "loss": 0.9834, + "mean_token_accuracy": 0.7053042650222778, + "num_tokens": 64930242.0, + "step": 2517 + }, + { + "epoch": 0.2765209751811992, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.112088680267334, + "learning_rate": 9.21303074670571e-07, + "loss": 0.9989, + "mean_token_accuracy": 0.7003909945487976, + "num_tokens": 64959196.0, + "step": 2518 + }, + { + "epoch": 0.27663079288381287, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.307029962539673, + "learning_rate": 9.216691068814055e-07, + "loss": 1.0303, + "mean_token_accuracy": 0.6906285881996155, + "num_tokens": 64987422.0, + "step": 2519 + }, + { + "epoch": 0.2767406105864265, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.2807509899139404, + "learning_rate": 9.220351390922401e-07, + "loss": 1.1088, + "mean_token_accuracy": 0.6813809871673584, + "num_tokens": 65015218.0, + "step": 2520 + }, + { + "epoch": 0.2768504282890402, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.3553311824798584, + "learning_rate": 9.224011713030746e-07, + "loss": 1.0338, + "mean_token_accuracy": 0.6972222328186035, + "num_tokens": 65042379.0, + "step": 2521 + }, + { + "epoch": 0.27696024599165386, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.346787929534912, + "learning_rate": 9.227672035139091e-07, + "loss": 1.0597, + "mean_token_accuracy": 0.6859641671180725, + "num_tokens": 65069290.0, + "step": 2522 + }, + { + "epoch": 0.2770700636942675, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.303706645965576, + "learning_rate": 9.231332357247438e-07, + "loss": 0.9985, + "mean_token_accuracy": 0.6970326900482178, + "num_tokens": 65094108.0, + "step": 2523 + }, + { + "epoch": 0.27717988139688116, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.1732821464538574, + 
"learning_rate": 9.234992679355783e-07, + "loss": 0.9902, + "mean_token_accuracy": 0.7064506411552429, + "num_tokens": 65122212.0, + "step": 2524 + }, + { + "epoch": 0.27728969909949486, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.224907159805298, + "learning_rate": 9.238653001464128e-07, + "loss": 1.0874, + "mean_token_accuracy": 0.6837852001190186, + "num_tokens": 65153331.0, + "step": 2525 + }, + { + "epoch": 0.2773995168021085, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.3415019512176514, + "learning_rate": 9.242313323572474e-07, + "loss": 1.002, + "mean_token_accuracy": 0.7035840153694153, + "num_tokens": 65178441.0, + "step": 2526 + }, + { + "epoch": 0.27750933450472215, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 2.17392897605896, + "learning_rate": 9.245973645680819e-07, + "loss": 0.9674, + "mean_token_accuracy": 0.7079892158508301, + "num_tokens": 65207757.0, + "step": 2527 + }, + { + "epoch": 0.2776191522073358, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.3290600776672363, + "learning_rate": 9.249633967789166e-07, + "loss": 1.0821, + "mean_token_accuracy": 0.6880391836166382, + "num_tokens": 65236719.0, + "step": 2528 + }, + { + "epoch": 0.2777289699099495, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.339012861251831, + "learning_rate": 9.253294289897511e-07, + "loss": 0.9491, + "mean_token_accuracy": 0.7156119346618652, + "num_tokens": 65260911.0, + "step": 2529 + }, + { + "epoch": 0.27783878761256314, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.698192596435547, + "learning_rate": 9.256954612005856e-07, + "loss": 0.9696, + "mean_token_accuracy": 0.7086411118507385, + "num_tokens": 65281523.0, + "step": 2530 + }, + { + "epoch": 0.2779486053151768, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 2.494565010070801, + "learning_rate": 9.260614934114202e-07, + "loss": 1.0379, + "mean_token_accuracy": 0.6896604895591736, + "num_tokens": 65304936.0, + "step": 2531 + }, + { + "epoch": 
0.2780584230177905, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.2623484134674072, + "learning_rate": 9.264275256222547e-07, + "loss": 0.9104, + "mean_token_accuracy": 0.7221536040306091, + "num_tokens": 65329771.0, + "step": 2532 + }, + { + "epoch": 0.27816824072040414, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.5148773193359375, + "learning_rate": 9.267935578330893e-07, + "loss": 1.1224, + "mean_token_accuracy": 0.6750246286392212, + "num_tokens": 65352739.0, + "step": 2533 + }, + { + "epoch": 0.2782780584230178, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.258408308029175, + "learning_rate": 9.271595900439239e-07, + "loss": 1.068, + "mean_token_accuracy": 0.6968870759010315, + "num_tokens": 65381469.0, + "step": 2534 + }, + { + "epoch": 0.27838787612563143, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.8067750930786133, + "learning_rate": 9.275256222547584e-07, + "loss": 0.9694, + "mean_token_accuracy": 0.709775447845459, + "num_tokens": 65402894.0, + "step": 2535 + }, + { + "epoch": 0.27849769382824513, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.252211809158325, + "learning_rate": 9.278916544655929e-07, + "loss": 1.109, + "mean_token_accuracy": 0.682852566242218, + "num_tokens": 65433634.0, + "step": 2536 + }, + { + "epoch": 0.2786075115308588, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.798273801803589, + "learning_rate": 9.282576866764275e-07, + "loss": 0.9083, + "mean_token_accuracy": 0.7229964733123779, + "num_tokens": 65452164.0, + "step": 2537 + }, + { + "epoch": 0.2787173292334724, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.288482189178467, + "learning_rate": 9.28623718887262e-07, + "loss": 1.0924, + "mean_token_accuracy": 0.6867024302482605, + "num_tokens": 65478967.0, + "step": 2538 + }, + { + "epoch": 0.2788271469360861, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.693490982055664, + "learning_rate": 9.289897510980966e-07, + "loss": 0.9656, + "mean_token_accuracy": 
0.7101354002952576, + "num_tokens": 65499590.0, + "step": 2539 + }, + { + "epoch": 0.27893696463869977, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.549984931945801, + "learning_rate": 9.293557833089312e-07, + "loss": 0.9791, + "mean_token_accuracy": 0.705079197883606, + "num_tokens": 65525619.0, + "step": 2540 + }, + { + "epoch": 0.2790467823413134, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.601914644241333, + "learning_rate": 9.297218155197657e-07, + "loss": 0.9904, + "mean_token_accuracy": 0.7051715850830078, + "num_tokens": 65547328.0, + "step": 2541 + }, + { + "epoch": 0.27915660004392706, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.356678009033203, + "learning_rate": 9.300878477306002e-07, + "loss": 1.0992, + "mean_token_accuracy": 0.6755271553993225, + "num_tokens": 65574613.0, + "step": 2542 + }, + { + "epoch": 0.27926641774654076, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.1940386295318604, + "learning_rate": 9.304538799414348e-07, + "loss": 0.9793, + "mean_token_accuracy": 0.7015875577926636, + "num_tokens": 65603136.0, + "step": 2543 + }, + { + "epoch": 0.2793762354491544, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.360536813735962, + "learning_rate": 9.308199121522694e-07, + "loss": 1.0191, + "mean_token_accuracy": 0.6974397897720337, + "num_tokens": 65628228.0, + "step": 2544 + }, + { + "epoch": 0.27948605315176805, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.649104118347168, + "learning_rate": 9.31185944363104e-07, + "loss": 1.0494, + "mean_token_accuracy": 0.6881983876228333, + "num_tokens": 65655181.0, + "step": 2545 + }, + { + "epoch": 0.2795958708543817, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.231238842010498, + "learning_rate": 9.315519765739385e-07, + "loss": 1.1342, + "mean_token_accuracy": 0.6718875169754028, + "num_tokens": 65684935.0, + "step": 2546 + }, + { + "epoch": 0.2797056885569954, + "ewc_loss": 7.092952728271484e-06, + "grad_norm": 2.546915054321289, + 
"learning_rate": 9.31918008784773e-07, + "loss": 1.0192, + "mean_token_accuracy": 0.6935510635375977, + "num_tokens": 65708943.0, + "step": 2547 + }, + { + "epoch": 0.27981550625960905, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.215686082839966, + "learning_rate": 9.322840409956076e-07, + "loss": 1.0046, + "mean_token_accuracy": 0.7075321674346924, + "num_tokens": 65737123.0, + "step": 2548 + }, + { + "epoch": 0.2799253239622227, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.4742963314056396, + "learning_rate": 9.326500732064422e-07, + "loss": 1.0531, + "mean_token_accuracy": 0.6950505971908569, + "num_tokens": 65760927.0, + "step": 2549 + }, + { + "epoch": 0.2800351416648364, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.2625832557678223, + "learning_rate": 9.330161054172767e-07, + "loss": 1.0488, + "mean_token_accuracy": 0.687757134437561, + "num_tokens": 65788313.0, + "step": 2550 + }, + { + "epoch": 0.28014495936745004, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.8242721557617188, + "learning_rate": 9.333821376281113e-07, + "loss": 1.0364, + "mean_token_accuracy": 0.6905431151390076, + "num_tokens": 65812857.0, + "step": 2551 + }, + { + "epoch": 0.2802547770700637, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.4050822257995605, + "learning_rate": 9.337481698389458e-07, + "loss": 0.9959, + "mean_token_accuracy": 0.7065736055374146, + "num_tokens": 65836955.0, + "step": 2552 + }, + { + "epoch": 0.28036459477267733, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.3401193618774414, + "learning_rate": 9.341142020497803e-07, + "loss": 1.0262, + "mean_token_accuracy": 0.6960378289222717, + "num_tokens": 65863603.0, + "step": 2553 + }, + { + "epoch": 0.28047441247529104, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.480990171432495, + "learning_rate": 9.344802342606149e-07, + "loss": 1.057, + "mean_token_accuracy": 0.6915377974510193, + "num_tokens": 65890120.0, + "step": 2554 + }, + { + "epoch": 0.2805842301779047, 
+ "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.068330764770508, + "learning_rate": 9.348462664714495e-07, + "loss": 1.0519, + "mean_token_accuracy": 0.6951122283935547, + "num_tokens": 65926236.0, + "step": 2555 + }, + { + "epoch": 0.2806940478805183, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.545186996459961, + "learning_rate": 9.35212298682284e-07, + "loss": 0.9702, + "mean_token_accuracy": 0.7090930938720703, + "num_tokens": 65947643.0, + "step": 2556 + }, + { + "epoch": 0.28080386558313203, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.51155948638916, + "learning_rate": 9.355783308931186e-07, + "loss": 0.9951, + "mean_token_accuracy": 0.7087917327880859, + "num_tokens": 65970667.0, + "step": 2557 + }, + { + "epoch": 0.2809136832857457, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.3493971824645996, + "learning_rate": 9.359443631039531e-07, + "loss": 1.0713, + "mean_token_accuracy": 0.6867068409919739, + "num_tokens": 65996991.0, + "step": 2558 + }, + { + "epoch": 0.2810235009883593, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.2123565673828125, + "learning_rate": 9.363103953147876e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6917329430580139, + "num_tokens": 66027766.0, + "step": 2559 + }, + { + "epoch": 0.28113331869097297, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.8004491329193115, + "learning_rate": 9.366764275256223e-07, + "loss": 0.9396, + "mean_token_accuracy": 0.7136078476905823, + "num_tokens": 66046417.0, + "step": 2560 + }, + { + "epoch": 0.28124313639358667, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.35270094871521, + "learning_rate": 9.370424597364568e-07, + "loss": 0.9838, + "mean_token_accuracy": 0.7059974670410156, + "num_tokens": 66073856.0, + "step": 2561 + }, + { + "epoch": 0.2813529540962003, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.3532824516296387, + "learning_rate": 9.374084919472914e-07, + "loss": 1.0246, + "mean_token_accuracy": 0.6982065439224243, + 
"num_tokens": 66101429.0, + "step": 2562 + }, + { + "epoch": 0.28146277179881396, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.2859463691711426, + "learning_rate": 9.377745241581259e-07, + "loss": 1.1172, + "mean_token_accuracy": 0.6704699993133545, + "num_tokens": 66129613.0, + "step": 2563 + }, + { + "epoch": 0.2815725895014276, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.2070233821868896, + "learning_rate": 9.381405563689604e-07, + "loss": 1.0431, + "mean_token_accuracy": 0.6885768175125122, + "num_tokens": 66159666.0, + "step": 2564 + }, + { + "epoch": 0.2816824072040413, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.6951053142547607, + "learning_rate": 9.385065885797951e-07, + "loss": 1.0152, + "mean_token_accuracy": 0.6987028121948242, + "num_tokens": 66181171.0, + "step": 2565 + }, + { + "epoch": 0.28179222490665495, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.410305976867676, + "learning_rate": 9.388726207906296e-07, + "loss": 0.9505, + "mean_token_accuracy": 0.7195159792900085, + "num_tokens": 66205013.0, + "step": 2566 + }, + { + "epoch": 0.2819020426092686, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.411017417907715, + "learning_rate": 9.392386530014641e-07, + "loss": 1.0211, + "mean_token_accuracy": 0.7018423080444336, + "num_tokens": 66230664.0, + "step": 2567 + }, + { + "epoch": 0.2820118603118823, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.2513859272003174, + "learning_rate": 9.396046852122987e-07, + "loss": 1.0194, + "mean_token_accuracy": 0.6984727382659912, + "num_tokens": 66260812.0, + "step": 2568 + }, + { + "epoch": 0.28212167801449595, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.5618855953216553, + "learning_rate": 9.399707174231332e-07, + "loss": 1.0475, + "mean_token_accuracy": 0.6855651140213013, + "num_tokens": 66282638.0, + "step": 2569 + }, + { + "epoch": 0.2822314957171096, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.0920605659484863, + "learning_rate": 
9.403367496339677e-07, + "loss": 1.0743, + "mean_token_accuracy": 0.6864367127418518, + "num_tokens": 66315689.0, + "step": 2570 + }, + { + "epoch": 0.28234131341972324, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.2296340465545654, + "learning_rate": 9.407027818448024e-07, + "loss": 0.9855, + "mean_token_accuracy": 0.7064635753631592, + "num_tokens": 66343772.0, + "step": 2571 + }, + { + "epoch": 0.28245113112233694, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.3508360385894775, + "learning_rate": 9.410688140556369e-07, + "loss": 0.9188, + "mean_token_accuracy": 0.7198394536972046, + "num_tokens": 66366792.0, + "step": 2572 + }, + { + "epoch": 0.2825609488249506, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.5231778621673584, + "learning_rate": 9.414348462664714e-07, + "loss": 0.988, + "mean_token_accuracy": 0.7024373412132263, + "num_tokens": 66389908.0, + "step": 2573 + }, + { + "epoch": 0.28267076652756423, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.199768304824829, + "learning_rate": 9.41800878477306e-07, + "loss": 1.072, + "mean_token_accuracy": 0.6871733665466309, + "num_tokens": 66423245.0, + "step": 2574 + }, + { + "epoch": 0.2827805842301779, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.4080469608306885, + "learning_rate": 9.421669106881405e-07, + "loss": 1.101, + "mean_token_accuracy": 0.6730087399482727, + "num_tokens": 66453024.0, + "step": 2575 + }, + { + "epoch": 0.2828904019327916, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.318293571472168, + "learning_rate": 9.425329428989752e-07, + "loss": 1.036, + "mean_token_accuracy": 0.6934233903884888, + "num_tokens": 66479736.0, + "step": 2576 + }, + { + "epoch": 0.2830002196354052, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.203706979751587, + "learning_rate": 9.428989751098097e-07, + "loss": 1.0777, + "mean_token_accuracy": 0.6781125068664551, + "num_tokens": 66509498.0, + "step": 2577 + }, + { + "epoch": 0.28311003733801887, + "ewc_loss": 
7.152557373046875e-06, + "grad_norm": 2.425203561782837, + "learning_rate": 9.432650073206442e-07, + "loss": 1.0313, + "mean_token_accuracy": 0.695460319519043, + "num_tokens": 66534943.0, + "step": 2578 + }, + { + "epoch": 0.2832198550406326, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 2.297560453414917, + "learning_rate": 9.436310395314788e-07, + "loss": 1.0167, + "mean_token_accuracy": 0.6970136165618896, + "num_tokens": 66561088.0, + "step": 2579 + }, + { + "epoch": 0.2833296727432462, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 2.371102809906006, + "learning_rate": 9.439970717423133e-07, + "loss": 0.8992, + "mean_token_accuracy": 0.7290869951248169, + "num_tokens": 66584705.0, + "step": 2580 + }, + { + "epoch": 0.28343949044585987, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 2.221721887588501, + "learning_rate": 9.443631039531479e-07, + "loss": 1.0799, + "mean_token_accuracy": 0.6805645227432251, + "num_tokens": 66617156.0, + "step": 2581 + }, + { + "epoch": 0.2835493081484735, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 2.4417147636413574, + "learning_rate": 9.447291361639825e-07, + "loss": 1.1003, + "mean_token_accuracy": 0.6724937558174133, + "num_tokens": 66644407.0, + "step": 2582 + }, + { + "epoch": 0.2836591258510872, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.2873635292053223, + "learning_rate": 9.45095168374817e-07, + "loss": 1.1295, + "mean_token_accuracy": 0.6756237745285034, + "num_tokens": 66680089.0, + "step": 2583 + }, + { + "epoch": 0.28376894355370086, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.052612543106079, + "learning_rate": 9.454612005856515e-07, + "loss": 1.0415, + "mean_token_accuracy": 0.6915295124053955, + "num_tokens": 66716109.0, + "step": 2584 + }, + { + "epoch": 0.2838787612563145, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.461359739303589, + "learning_rate": 9.458272327964861e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.6916950941085815, + "num_tokens": 
66740383.0, + "step": 2585 + }, + { + "epoch": 0.2839885789589282, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.020930528640747, + "learning_rate": 9.461932650073206e-07, + "loss": 1.0631, + "mean_token_accuracy": 0.6915479898452759, + "num_tokens": 66774475.0, + "step": 2586 + }, + { + "epoch": 0.28409839666154185, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.2027978897094727, + "learning_rate": 9.465592972181552e-07, + "loss": 0.9528, + "mean_token_accuracy": 0.7167491912841797, + "num_tokens": 66802774.0, + "step": 2587 + }, + { + "epoch": 0.2842082143641555, + "ewc_loss": 7.241964340209961e-06, + "grad_norm": 2.748507022857666, + "learning_rate": 9.469253294289898e-07, + "loss": 0.9982, + "mean_token_accuracy": 0.6967338919639587, + "num_tokens": 66822965.0, + "step": 2588 + }, + { + "epoch": 0.28431803206676914, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.3188977241516113, + "learning_rate": 9.472913616398243e-07, + "loss": 1.0663, + "mean_token_accuracy": 0.6903601884841919, + "num_tokens": 66852734.0, + "step": 2589 + }, + { + "epoch": 0.28442784976938285, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.369957447052002, + "learning_rate": 9.476573938506587e-07, + "loss": 0.9514, + "mean_token_accuracy": 0.7187988758087158, + "num_tokens": 66877006.0, + "step": 2590 + }, + { + "epoch": 0.2845376674719965, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.457637071609497, + "learning_rate": 9.480234260614933e-07, + "loss": 1.0753, + "mean_token_accuracy": 0.6881956458091736, + "num_tokens": 66902300.0, + "step": 2591 + }, + { + "epoch": 0.28464748517461014, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.2855827808380127, + "learning_rate": 9.48389458272328e-07, + "loss": 0.9459, + "mean_token_accuracy": 0.7185667157173157, + "num_tokens": 66930991.0, + "step": 2592 + }, + { + "epoch": 0.2847573028772238, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.3952112197875977, + "learning_rate": 9.487554904831626e-07, 
+ "loss": 1.0406, + "mean_token_accuracy": 0.6949256658554077, + "num_tokens": 66957004.0, + "step": 2593 + }, + { + "epoch": 0.2848671205798375, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 2.716083288192749, + "learning_rate": 9.49121522693997e-07, + "loss": 1.0458, + "mean_token_accuracy": 0.7019253969192505, + "num_tokens": 66980931.0, + "step": 2594 + }, + { + "epoch": 0.28497693828245113, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.700208902359009, + "learning_rate": 9.494875549048315e-07, + "loss": 1.0034, + "mean_token_accuracy": 0.7061468362808228, + "num_tokens": 67005148.0, + "step": 2595 + }, + { + "epoch": 0.2850867559850648, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.747389316558838, + "learning_rate": 9.498535871156661e-07, + "loss": 1.033, + "mean_token_accuracy": 0.6823756694793701, + "num_tokens": 67024780.0, + "step": 2596 + }, + { + "epoch": 0.2851965736876785, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.305654764175415, + "learning_rate": 9.502196193265007e-07, + "loss": 1.0538, + "mean_token_accuracy": 0.693296492099762, + "num_tokens": 67051088.0, + "step": 2597 + }, + { + "epoch": 0.2853063913902921, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.6492855548858643, + "learning_rate": 9.505856515373352e-07, + "loss": 1.0222, + "mean_token_accuracy": 0.699458122253418, + "num_tokens": 67073148.0, + "step": 2598 + }, + { + "epoch": 0.28541620909290577, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.355232000350952, + "learning_rate": 9.509516837481698e-07, + "loss": 1.0961, + "mean_token_accuracy": 0.6836916208267212, + "num_tokens": 67103289.0, + "step": 2599 + }, + { + "epoch": 0.2855260267955194, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.2639997005462646, + "learning_rate": 9.513177159590043e-07, + "loss": 1.1493, + "mean_token_accuracy": 0.6633933782577515, + "num_tokens": 67134940.0, + "step": 2600 + }, + { + "epoch": 0.2856358444981331, + "ewc_loss": 7.271766662597656e-06, + 
"grad_norm": 2.277751922607422, + "learning_rate": 9.516837481698388e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.6901615858078003, + "num_tokens": 67162861.0, + "step": 2601 + }, + { + "epoch": 0.28574566220074676, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.6565864086151123, + "learning_rate": 9.520497803806734e-07, + "loss": 1.0022, + "mean_token_accuracy": 0.6989074945449829, + "num_tokens": 67183291.0, + "step": 2602 + }, + { + "epoch": 0.2858554799033604, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.422913074493408, + "learning_rate": 9.52415812591508e-07, + "loss": 1.0361, + "mean_token_accuracy": 0.6933634877204895, + "num_tokens": 67208977.0, + "step": 2603 + }, + { + "epoch": 0.28596529760597406, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.4286842346191406, + "learning_rate": 9.527818448023425e-07, + "loss": 1.0097, + "mean_token_accuracy": 0.7011570334434509, + "num_tokens": 67233401.0, + "step": 2604 + }, + { + "epoch": 0.28607511530858776, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.4110116958618164, + "learning_rate": 9.531478770131771e-07, + "loss": 1.0557, + "mean_token_accuracy": 0.697342574596405, + "num_tokens": 67258898.0, + "step": 2605 + }, + { + "epoch": 0.2861849330112014, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.2250003814697266, + "learning_rate": 9.535139092240116e-07, + "loss": 1.0131, + "mean_token_accuracy": 0.7014333009719849, + "num_tokens": 67288247.0, + "step": 2606 + }, + { + "epoch": 0.28629475071381505, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 2.391723394393921, + "learning_rate": 9.538799414348462e-07, + "loss": 1.0266, + "mean_token_accuracy": 0.6964353322982788, + "num_tokens": 67312209.0, + "step": 2607 + }, + { + "epoch": 0.28640456841642875, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.400219202041626, + "learning_rate": 9.542459736456808e-07, + "loss": 1.0263, + "mean_token_accuracy": 0.6991718411445618, + "num_tokens": 67336959.0, + "step": 
2608 + }, + { + "epoch": 0.2865143861190424, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.2046496868133545, + "learning_rate": 9.546120058565154e-07, + "loss": 1.0196, + "mean_token_accuracy": 0.694710910320282, + "num_tokens": 67365192.0, + "step": 2609 + }, + { + "epoch": 0.28662420382165604, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.3760905265808105, + "learning_rate": 9.549780380673498e-07, + "loss": 1.0204, + "mean_token_accuracy": 0.6955065131187439, + "num_tokens": 67392262.0, + "step": 2610 + }, + { + "epoch": 0.2867340215242697, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.24375057220459, + "learning_rate": 9.553440702781844e-07, + "loss": 1.0242, + "mean_token_accuracy": 0.7000688314437866, + "num_tokens": 67419572.0, + "step": 2611 + }, + { + "epoch": 0.2868438392268834, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.6171488761901855, + "learning_rate": 9.55710102489019e-07, + "loss": 0.9665, + "mean_token_accuracy": 0.7163792848587036, + "num_tokens": 67440259.0, + "step": 2612 + }, + { + "epoch": 0.28695365692949704, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.521954298019409, + "learning_rate": 9.560761346998536e-07, + "loss": 0.9663, + "mean_token_accuracy": 0.7069350481033325, + "num_tokens": 67464103.0, + "step": 2613 + }, + { + "epoch": 0.2870634746321107, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.1030800342559814, + "learning_rate": 9.56442166910688e-07, + "loss": 1.1153, + "mean_token_accuracy": 0.6722081899642944, + "num_tokens": 67498304.0, + "step": 2614 + }, + { + "epoch": 0.2871732923347244, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.7034645080566406, + "learning_rate": 9.568081991215226e-07, + "loss": 1.0978, + "mean_token_accuracy": 0.6771966814994812, + "num_tokens": 67521821.0, + "step": 2615 + }, + { + "epoch": 0.28728311003733803, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.716057300567627, + "learning_rate": 9.571742313323572e-07, + "loss": 
0.9844, + "mean_token_accuracy": 0.7082080841064453, + "num_tokens": 67541624.0, + "step": 2616 + }, + { + "epoch": 0.2873929277399517, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.52131724357605, + "learning_rate": 9.575402635431916e-07, + "loss": 1.0064, + "mean_token_accuracy": 0.7078765630722046, + "num_tokens": 67563863.0, + "step": 2617 + }, + { + "epoch": 0.2875027454425653, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 2.5180017948150635, + "learning_rate": 9.579062957540262e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.6746594905853271, + "num_tokens": 67588335.0, + "step": 2618 + }, + { + "epoch": 0.287612563145179, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.608947992324829, + "learning_rate": 9.582723279648608e-07, + "loss": 0.9542, + "mean_token_accuracy": 0.7103349566459656, + "num_tokens": 67610513.0, + "step": 2619 + }, + { + "epoch": 0.28772238084779267, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.174293041229248, + "learning_rate": 9.586383601756954e-07, + "loss": 1.1526, + "mean_token_accuracy": 0.663521409034729, + "num_tokens": 67643857.0, + "step": 2620 + }, + { + "epoch": 0.2878321985504063, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.518293857574463, + "learning_rate": 9.5900439238653e-07, + "loss": 1.0007, + "mean_token_accuracy": 0.6964924335479736, + "num_tokens": 67665589.0, + "step": 2621 + }, + { + "epoch": 0.28794201625301996, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.1477694511413574, + "learning_rate": 9.593704245973644e-07, + "loss": 1.0376, + "mean_token_accuracy": 0.6947619915008545, + "num_tokens": 67695418.0, + "step": 2622 + }, + { + "epoch": 0.28805183395563366, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 2.595109701156616, + "learning_rate": 9.59736456808199e-07, + "loss": 1.0041, + "mean_token_accuracy": 0.6970020532608032, + "num_tokens": 67716503.0, + "step": 2623 + }, + { + "epoch": 0.2881616516582473, + "ewc_loss": 7.420778274536133e-06, + 
"grad_norm": 2.269442081451416, + "learning_rate": 9.601024890190336e-07, + "loss": 1.0133, + "mean_token_accuracy": 0.7018765807151794, + "num_tokens": 67743036.0, + "step": 2624 + }, + { + "epoch": 0.28827146936086095, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.141793966293335, + "learning_rate": 9.604685212298682e-07, + "loss": 1.0933, + "mean_token_accuracy": 0.6779361367225647, + "num_tokens": 67775477.0, + "step": 2625 + }, + { + "epoch": 0.28838128706347466, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.7004613876342773, + "learning_rate": 9.608345534407028e-07, + "loss": 1.0459, + "mean_token_accuracy": 0.6929796934127808, + "num_tokens": 67796246.0, + "step": 2626 + }, + { + "epoch": 0.2884911047660883, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 2.6936025619506836, + "learning_rate": 9.612005856515372e-07, + "loss": 1.0026, + "mean_token_accuracy": 0.7080317735671997, + "num_tokens": 67820758.0, + "step": 2627 + }, + { + "epoch": 0.28860092246870195, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.4626095294952393, + "learning_rate": 9.615666178623718e-07, + "loss": 1.0279, + "mean_token_accuracy": 0.6968789100646973, + "num_tokens": 67844105.0, + "step": 2628 + }, + { + "epoch": 0.2887107401713156, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.2110955715179443, + "learning_rate": 9.619326500732064e-07, + "loss": 1.1025, + "mean_token_accuracy": 0.671140193939209, + "num_tokens": 67876536.0, + "step": 2629 + }, + { + "epoch": 0.2888205578739293, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.2628133296966553, + "learning_rate": 9.62298682284041e-07, + "loss": 0.9965, + "mean_token_accuracy": 0.7090871334075928, + "num_tokens": 67906726.0, + "step": 2630 + }, + { + "epoch": 0.28893037557654294, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.2944979667663574, + "learning_rate": 9.626647144948754e-07, + "loss": 1.0478, + "mean_token_accuracy": 0.6805652976036072, + "num_tokens": 67934853.0, + "step": 
2631 + }, + { + "epoch": 0.2890401932791566, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.523338556289673, + "learning_rate": 9.6303074670571e-07, + "loss": 1.0365, + "mean_token_accuracy": 0.7017149329185486, + "num_tokens": 67960487.0, + "step": 2632 + }, + { + "epoch": 0.2891500109817703, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 2.415775775909424, + "learning_rate": 9.633967789165446e-07, + "loss": 1.0177, + "mean_token_accuracy": 0.6955696940422058, + "num_tokens": 67985099.0, + "step": 2633 + }, + { + "epoch": 0.28925982868438394, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.781161069869995, + "learning_rate": 9.63762811127379e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.6723939180374146, + "num_tokens": 68005907.0, + "step": 2634 + }, + { + "epoch": 0.2893696463869976, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.711505651473999, + "learning_rate": 9.641288433382138e-07, + "loss": 0.9859, + "mean_token_accuracy": 0.7094137668609619, + "num_tokens": 68025604.0, + "step": 2635 + }, + { + "epoch": 0.2894794640896112, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.207024335861206, + "learning_rate": 9.644948755490482e-07, + "loss": 0.9122, + "mean_token_accuracy": 0.7253357768058777, + "num_tokens": 68053834.0, + "step": 2636 + }, + { + "epoch": 0.28958928179222493, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.585554599761963, + "learning_rate": 9.648609077598828e-07, + "loss": 1.0394, + "mean_token_accuracy": 0.6900493502616882, + "num_tokens": 68075654.0, + "step": 2637 + }, + { + "epoch": 0.2896990994948386, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.466384172439575, + "learning_rate": 9.652269399707174e-07, + "loss": 1.0155, + "mean_token_accuracy": 0.7010323405265808, + "num_tokens": 68100679.0, + "step": 2638 + }, + { + "epoch": 0.2898089171974522, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.8798797130584717, + "learning_rate": 9.655929721815518e-07, + "loss": 1.0626, + 
"mean_token_accuracy": 0.691373884677887, + "num_tokens": 68126152.0, + "step": 2639 + }, + { + "epoch": 0.28991873490006587, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.066270589828491, + "learning_rate": 9.659590043923866e-07, + "loss": 1.0494, + "mean_token_accuracy": 0.6926164627075195, + "num_tokens": 68160904.0, + "step": 2640 + }, + { + "epoch": 0.29002855260267957, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.175726890563965, + "learning_rate": 9.66325036603221e-07, + "loss": 1.1278, + "mean_token_accuracy": 0.6691535711288452, + "num_tokens": 68194352.0, + "step": 2641 + }, + { + "epoch": 0.2901383703052932, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.4038171768188477, + "learning_rate": 9.666910688140556e-07, + "loss": 1.0474, + "mean_token_accuracy": 0.6952660083770752, + "num_tokens": 68220095.0, + "step": 2642 + }, + { + "epoch": 0.29024818800790686, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.4420907497406006, + "learning_rate": 9.670571010248902e-07, + "loss": 1.0068, + "mean_token_accuracy": 0.7027627229690552, + "num_tokens": 68245461.0, + "step": 2643 + }, + { + "epoch": 0.29035800571052056, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.4035236835479736, + "learning_rate": 9.674231332357246e-07, + "loss": 1.0169, + "mean_token_accuracy": 0.6924200057983398, + "num_tokens": 68270879.0, + "step": 2644 + }, + { + "epoch": 0.2904678234131342, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.568535566329956, + "learning_rate": 9.677891654465592e-07, + "loss": 0.9485, + "mean_token_accuracy": 0.7095713019371033, + "num_tokens": 68292893.0, + "step": 2645 + }, + { + "epoch": 0.29057764111574785, + "ewc_loss": 7.420778274536133e-06, + "grad_norm": 2.258394718170166, + "learning_rate": 9.681551976573938e-07, + "loss": 1.058, + "mean_token_accuracy": 0.6807268261909485, + "num_tokens": 68319075.0, + "step": 2646 + }, + { + "epoch": 0.2906874588183615, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 
2.543914318084717, + "learning_rate": 9.685212298682284e-07, + "loss": 0.9654, + "mean_token_accuracy": 0.7095271348953247, + "num_tokens": 68342098.0, + "step": 2647 + }, + { + "epoch": 0.2907972765209752, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.2864301204681396, + "learning_rate": 9.688872620790628e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.6701505780220032, + "num_tokens": 68371225.0, + "step": 2648 + }, + { + "epoch": 0.29090709422358885, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.3446083068847656, + "learning_rate": 9.692532942898974e-07, + "loss": 1.0657, + "mean_token_accuracy": 0.685663104057312, + "num_tokens": 68398188.0, + "step": 2649 + }, + { + "epoch": 0.2910169119262025, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.4608309268951416, + "learning_rate": 9.69619326500732e-07, + "loss": 0.9826, + "mean_token_accuracy": 0.7071747779846191, + "num_tokens": 68425520.0, + "step": 2650 + }, + { + "epoch": 0.29112672962881614, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.011970281600952, + "learning_rate": 9.699853587115666e-07, + "loss": 1.0962, + "mean_token_accuracy": 0.6763371229171753, + "num_tokens": 68462456.0, + "step": 2651 + }, + { + "epoch": 0.29123654733142984, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 2.398528575897217, + "learning_rate": 9.703513909224012e-07, + "loss": 1.1078, + "mean_token_accuracy": 0.676500141620636, + "num_tokens": 68489331.0, + "step": 2652 + }, + { + "epoch": 0.2913463650340435, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.3614330291748047, + "learning_rate": 9.707174231332356e-07, + "loss": 1.0527, + "mean_token_accuracy": 0.6878966093063354, + "num_tokens": 68516051.0, + "step": 2653 + }, + { + "epoch": 0.29145618273665713, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 2.3882193565368652, + "learning_rate": 9.710834553440702e-07, + "loss": 1.0558, + "mean_token_accuracy": 0.6951499581336975, + "num_tokens": 68539805.0, + "step": 2654 + }, + { + 
"epoch": 0.29156600043927083, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.3993747234344482, + "learning_rate": 9.714494875549048e-07, + "loss": 1.02, + "mean_token_accuracy": 0.6978551745414734, + "num_tokens": 68565221.0, + "step": 2655 + }, + { + "epoch": 0.2916758181418845, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.290757179260254, + "learning_rate": 9.718155197657394e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.7007814645767212, + "num_tokens": 68593703.0, + "step": 2656 + }, + { + "epoch": 0.2917856358444981, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.53604793548584, + "learning_rate": 9.72181551976574e-07, + "loss": 1.0072, + "mean_token_accuracy": 0.6994376182556152, + "num_tokens": 68615507.0, + "step": 2657 + }, + { + "epoch": 0.29189545354711177, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.2796528339385986, + "learning_rate": 9.725475841874084e-07, + "loss": 1.0813, + "mean_token_accuracy": 0.6978039741516113, + "num_tokens": 68644423.0, + "step": 2658 + }, + { + "epoch": 0.2920052712497255, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.288439989089966, + "learning_rate": 9.72913616398243e-07, + "loss": 1.0445, + "mean_token_accuracy": 0.6935255527496338, + "num_tokens": 68670329.0, + "step": 2659 + }, + { + "epoch": 0.2921150889523391, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.665414571762085, + "learning_rate": 9.732796486090776e-07, + "loss": 1.0559, + "mean_token_accuracy": 0.6968246698379517, + "num_tokens": 68691542.0, + "step": 2660 + }, + { + "epoch": 0.29222490665495277, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.666184186935425, + "learning_rate": 9.736456808199122e-07, + "loss": 1.0041, + "mean_token_accuracy": 0.7004314661026001, + "num_tokens": 68713333.0, + "step": 2661 + }, + { + "epoch": 0.29233472435756647, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 2.508481502532959, + "learning_rate": 9.740117130307466e-07, + "loss": 1.0071, + "mean_token_accuracy": 
0.7056434750556946, + "num_tokens": 68735818.0, + "step": 2662 + }, + { + "epoch": 0.2924445420601801, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.3313772678375244, + "learning_rate": 9.743777452415812e-07, + "loss": 1.0357, + "mean_token_accuracy": 0.6944246888160706, + "num_tokens": 68762899.0, + "step": 2663 + }, + { + "epoch": 0.29255435976279376, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.4094510078430176, + "learning_rate": 9.747437774524158e-07, + "loss": 1.0511, + "mean_token_accuracy": 0.6972663402557373, + "num_tokens": 68786750.0, + "step": 2664 + }, + { + "epoch": 0.2926641774654074, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.5328850746154785, + "learning_rate": 9.751098096632502e-07, + "loss": 1.0003, + "mean_token_accuracy": 0.7022455930709839, + "num_tokens": 68810131.0, + "step": 2665 + }, + { + "epoch": 0.2927739951680211, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.393266201019287, + "learning_rate": 9.754758418740848e-07, + "loss": 1.1106, + "mean_token_accuracy": 0.6752409934997559, + "num_tokens": 68837302.0, + "step": 2666 + }, + { + "epoch": 0.29288381287063475, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.213108777999878, + "learning_rate": 9.758418740849194e-07, + "loss": 1.0331, + "mean_token_accuracy": 0.6998635530471802, + "num_tokens": 68866848.0, + "step": 2667 + }, + { + "epoch": 0.2929936305732484, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.5619683265686035, + "learning_rate": 9.76207906295754e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6831240653991699, + "num_tokens": 68892221.0, + "step": 2668 + }, + { + "epoch": 0.29310344827586204, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.2128758430480957, + "learning_rate": 9.765739385065886e-07, + "loss": 0.9988, + "mean_token_accuracy": 0.7024914622306824, + "num_tokens": 68920022.0, + "step": 2669 + }, + { + "epoch": 0.29321326597847575, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.617708444595337, + 
"learning_rate": 9.76939970717423e-07, + "loss": 0.9985, + "mean_token_accuracy": 0.7086721658706665, + "num_tokens": 68941852.0, + "step": 2670 + }, + { + "epoch": 0.2933230836810894, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.293879747390747, + "learning_rate": 9.773060029282576e-07, + "loss": 1.0813, + "mean_token_accuracy": 0.6817092895507812, + "num_tokens": 68973292.0, + "step": 2671 + }, + { + "epoch": 0.29343290138370304, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.7099320888519287, + "learning_rate": 9.776720351390922e-07, + "loss": 1.0698, + "mean_token_accuracy": 0.6896761655807495, + "num_tokens": 68995369.0, + "step": 2672 + }, + { + "epoch": 0.29354271908631674, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.4114010334014893, + "learning_rate": 9.780380673499268e-07, + "loss": 1.04, + "mean_token_accuracy": 0.685096263885498, + "num_tokens": 69018419.0, + "step": 2673 + }, + { + "epoch": 0.2936525367889304, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.803680181503296, + "learning_rate": 9.784040995607614e-07, + "loss": 0.992, + "mean_token_accuracy": 0.7089266180992126, + "num_tokens": 69039570.0, + "step": 2674 + }, + { + "epoch": 0.29376235449154403, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.4209306240081787, + "learning_rate": 9.787701317715958e-07, + "loss": 0.9902, + "mean_token_accuracy": 0.7090197801589966, + "num_tokens": 69065295.0, + "step": 2675 + }, + { + "epoch": 0.2938721721941577, + "ewc_loss": 7.599592208862305e-06, + "grad_norm": 2.5403215885162354, + "learning_rate": 9.791361639824304e-07, + "loss": 1.0099, + "mean_token_accuracy": 0.7061904668807983, + "num_tokens": 69088292.0, + "step": 2676 + }, + { + "epoch": 0.2939819898967714, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.677805185317993, + "learning_rate": 9.79502196193265e-07, + "loss": 0.9954, + "mean_token_accuracy": 0.6985260844230652, + "num_tokens": 69109009.0, + "step": 2677 + }, + { + "epoch": 0.294091807599385, 
+ "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.2315540313720703, + "learning_rate": 9.798682284040996e-07, + "loss": 0.9896, + "mean_token_accuracy": 0.7113436460494995, + "num_tokens": 69135296.0, + "step": 2678 + }, + { + "epoch": 0.29420162530199867, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.124081611633301, + "learning_rate": 9.80234260614934e-07, + "loss": 1.0843, + "mean_token_accuracy": 0.6774399280548096, + "num_tokens": 69167474.0, + "step": 2679 + }, + { + "epoch": 0.2943114430046123, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.132100820541382, + "learning_rate": 9.806002928257686e-07, + "loss": 1.0648, + "mean_token_accuracy": 0.6899372339248657, + "num_tokens": 69199833.0, + "step": 2680 + }, + { + "epoch": 0.294421260707226, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.2489242553710938, + "learning_rate": 9.809663250366032e-07, + "loss": 1.0669, + "mean_token_accuracy": 0.6854892373085022, + "num_tokens": 69228030.0, + "step": 2681 + }, + { + "epoch": 0.29453107840983966, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.345019817352295, + "learning_rate": 9.813323572474376e-07, + "loss": 1.0605, + "mean_token_accuracy": 0.6798912286758423, + "num_tokens": 69253176.0, + "step": 2682 + }, + { + "epoch": 0.2946408961124533, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.305860757827759, + "learning_rate": 9.816983894582724e-07, + "loss": 0.9817, + "mean_token_accuracy": 0.7083216905593872, + "num_tokens": 69279049.0, + "step": 2683 + }, + { + "epoch": 0.294750713815067, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.101475954055786, + "learning_rate": 9.820644216691068e-07, + "loss": 1.0693, + "mean_token_accuracy": 0.6863142848014832, + "num_tokens": 69311963.0, + "step": 2684 + }, + { + "epoch": 0.29486053151768066, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.3150434494018555, + "learning_rate": 9.824304538799414e-07, + "loss": 0.9694, + "mean_token_accuracy": 0.7121478319168091, + "num_tokens": 69338490.0, + 
"step": 2685 + }, + { + "epoch": 0.2949703492202943, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.5416581630706787, + "learning_rate": 9.82796486090776e-07, + "loss": 1.0, + "mean_token_accuracy": 0.6980584263801575, + "num_tokens": 69360211.0, + "step": 2686 + }, + { + "epoch": 0.29508016692290795, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 2.291653633117676, + "learning_rate": 9.831625183016104e-07, + "loss": 1.1651, + "mean_token_accuracy": 0.6721975803375244, + "num_tokens": 69389732.0, + "step": 2687 + }, + { + "epoch": 0.29518998462552165, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.5533926486968994, + "learning_rate": 9.835285505124452e-07, + "loss": 0.9348, + "mean_token_accuracy": 0.7151473760604858, + "num_tokens": 69412227.0, + "step": 2688 + }, + { + "epoch": 0.2952998023281353, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.472583055496216, + "learning_rate": 9.838945827232796e-07, + "loss": 1.0549, + "mean_token_accuracy": 0.6910828948020935, + "num_tokens": 69435342.0, + "step": 2689 + }, + { + "epoch": 0.29540962003074894, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.180121660232544, + "learning_rate": 9.842606149341142e-07, + "loss": 1.0589, + "mean_token_accuracy": 0.6832041144371033, + "num_tokens": 69466875.0, + "step": 2690 + }, + { + "epoch": 0.29551943773336264, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.360699415206909, + "learning_rate": 9.846266471449488e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.6997342109680176, + "num_tokens": 69493073.0, + "step": 2691 + }, + { + "epoch": 0.2956292554359763, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.2473630905151367, + "learning_rate": 9.849926793557832e-07, + "loss": 0.9842, + "mean_token_accuracy": 0.7043440341949463, + "num_tokens": 69520144.0, + "step": 2692 + }, + { + "epoch": 0.29573907313858994, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.405120849609375, + "learning_rate": 9.853587115666178e-07, + "loss": 1.0121, + 
"mean_token_accuracy": 0.6972470283508301, + "num_tokens": 69545286.0, + "step": 2693 + }, + { + "epoch": 0.2958488908412036, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.294224977493286, + "learning_rate": 9.857247437774524e-07, + "loss": 1.1453, + "mean_token_accuracy": 0.6719175577163696, + "num_tokens": 69575666.0, + "step": 2694 + }, + { + "epoch": 0.2959587085438173, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.5472490787506104, + "learning_rate": 9.86090775988287e-07, + "loss": 1.0065, + "mean_token_accuracy": 0.6980343461036682, + "num_tokens": 69597333.0, + "step": 2695 + }, + { + "epoch": 0.29606852624643093, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.9471049308776855, + "learning_rate": 9.864568081991214e-07, + "loss": 0.9791, + "mean_token_accuracy": 0.7055296301841736, + "num_tokens": 69615995.0, + "step": 2696 + }, + { + "epoch": 0.2961783439490446, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.501920461654663, + "learning_rate": 9.86822840409956e-07, + "loss": 1.1207, + "mean_token_accuracy": 0.6750835180282593, + "num_tokens": 69640263.0, + "step": 2697 + }, + { + "epoch": 0.2962881616516582, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.5356147289276123, + "learning_rate": 9.871888726207906e-07, + "loss": 1.0276, + "mean_token_accuracy": 0.7037374973297119, + "num_tokens": 69662277.0, + "step": 2698 + }, + { + "epoch": 0.2963979793542719, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.088099956512451, + "learning_rate": 9.875549048316252e-07, + "loss": 1.1041, + "mean_token_accuracy": 0.6777639389038086, + "num_tokens": 69694745.0, + "step": 2699 + }, + { + "epoch": 0.29650779705688557, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4291467666625977, + "learning_rate": 9.879209370424598e-07, + "loss": 1.0346, + "mean_token_accuracy": 0.6937811970710754, + "num_tokens": 69720316.0, + "step": 2700 + }, + { + "epoch": 0.2966176147594992, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.276458263397217, 
+ "learning_rate": 9.882869692532942e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.6867598295211792, + "num_tokens": 69750100.0, + "step": 2701 + }, + { + "epoch": 0.2967274324621129, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2797861099243164, + "learning_rate": 9.886530014641288e-07, + "loss": 1.0673, + "mean_token_accuracy": 0.6881171464920044, + "num_tokens": 69777792.0, + "step": 2702 + }, + { + "epoch": 0.29683725016472656, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.3906188011169434, + "learning_rate": 9.890190336749634e-07, + "loss": 1.0871, + "mean_token_accuracy": 0.6730233430862427, + "num_tokens": 69804220.0, + "step": 2703 + }, + { + "epoch": 0.2969470678673402, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.2311959266662598, + "learning_rate": 9.89385065885798e-07, + "loss": 1.0889, + "mean_token_accuracy": 0.6742119789123535, + "num_tokens": 69837893.0, + "step": 2704 + }, + { + "epoch": 0.29705688556995385, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.4768214225769043, + "learning_rate": 9.897510980966326e-07, + "loss": 1.0031, + "mean_token_accuracy": 0.7093499898910522, + "num_tokens": 69862269.0, + "step": 2705 + }, + { + "epoch": 0.29716670327256756, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.202930450439453, + "learning_rate": 9.90117130307467e-07, + "loss": 1.1021, + "mean_token_accuracy": 0.6738789677619934, + "num_tokens": 69893396.0, + "step": 2706 + }, + { + "epoch": 0.2972765209751812, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.317650556564331, + "learning_rate": 9.904831625183016e-07, + "loss": 1.01, + "mean_token_accuracy": 0.6955061554908752, + "num_tokens": 69920648.0, + "step": 2707 + }, + { + "epoch": 0.29738633867779485, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2517876625061035, + "learning_rate": 9.908491947291362e-07, + "loss": 1.0174, + "mean_token_accuracy": 0.6970945000648499, + "num_tokens": 69947077.0, + "step": 2708 + }, + { + "epoch": 
0.29749615638040855, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 4.09688138961792, + "learning_rate": 9.912152269399708e-07, + "loss": 1.0976, + "mean_token_accuracy": 0.6820414662361145, + "num_tokens": 69977260.0, + "step": 2709 + }, + { + "epoch": 0.2976059740830222, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.216926097869873, + "learning_rate": 9.915812591508052e-07, + "loss": 1.0195, + "mean_token_accuracy": 0.7010473608970642, + "num_tokens": 70006314.0, + "step": 2710 + }, + { + "epoch": 0.29771579178563584, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 2.358736038208008, + "learning_rate": 9.919472913616398e-07, + "loss": 1.0118, + "mean_token_accuracy": 0.6995733976364136, + "num_tokens": 70033098.0, + "step": 2711 + }, + { + "epoch": 0.2978256094882495, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.290311336517334, + "learning_rate": 9.923133235724744e-07, + "loss": 1.0289, + "mean_token_accuracy": 0.7023711204528809, + "num_tokens": 70062200.0, + "step": 2712 + }, + { + "epoch": 0.2979354271908632, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.263007164001465, + "learning_rate": 9.926793557833088e-07, + "loss": 1.0886, + "mean_token_accuracy": 0.6948488354682922, + "num_tokens": 70089711.0, + "step": 2713 + }, + { + "epoch": 0.29804524489347684, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3151650428771973, + "learning_rate": 9.930453879941434e-07, + "loss": 0.944, + "mean_token_accuracy": 0.719693660736084, + "num_tokens": 70113978.0, + "step": 2714 + }, + { + "epoch": 0.2981550625960905, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.4281303882598877, + "learning_rate": 9.93411420204978e-07, + "loss": 1.0049, + "mean_token_accuracy": 0.6991492509841919, + "num_tokens": 70138238.0, + "step": 2715 + }, + { + "epoch": 0.2982648802987041, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5705411434173584, + "learning_rate": 9.937774524158126e-07, + "loss": 0.9851, + "mean_token_accuracy": 
0.7093960046768188, + "num_tokens": 70161906.0, + "step": 2716 + }, + { + "epoch": 0.29837469800131783, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1857542991638184, + "learning_rate": 9.941434846266472e-07, + "loss": 0.9809, + "mean_token_accuracy": 0.7059760093688965, + "num_tokens": 70192723.0, + "step": 2717 + }, + { + "epoch": 0.2984845157039315, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.2614662647247314, + "learning_rate": 9.945095168374816e-07, + "loss": 1.0108, + "mean_token_accuracy": 0.6989330053329468, + "num_tokens": 70221730.0, + "step": 2718 + }, + { + "epoch": 0.2985943334065451, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.339188575744629, + "learning_rate": 9.948755490483162e-07, + "loss": 0.9906, + "mean_token_accuracy": 0.7069066762924194, + "num_tokens": 70249072.0, + "step": 2719 + }, + { + "epoch": 0.2987041511091588, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.68335223197937, + "learning_rate": 9.952415812591508e-07, + "loss": 0.9884, + "mean_token_accuracy": 0.7034769058227539, + "num_tokens": 70270451.0, + "step": 2720 + }, + { + "epoch": 0.29881396881177247, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.7953076362609863, + "learning_rate": 9.956076134699854e-07, + "loss": 0.9368, + "mean_token_accuracy": 0.7229268550872803, + "num_tokens": 70290888.0, + "step": 2721 + }, + { + "epoch": 0.2989237865143861, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5973658561706543, + "learning_rate": 9.9597364568082e-07, + "loss": 1.1081, + "mean_token_accuracy": 0.6752109527587891, + "num_tokens": 70314777.0, + "step": 2722 + }, + { + "epoch": 0.29903360421699976, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.214993476867676, + "learning_rate": 9.963396778916544e-07, + "loss": 0.9978, + "mean_token_accuracy": 0.7041466236114502, + "num_tokens": 70343692.0, + "step": 2723 + }, + { + "epoch": 0.29914342191961346, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3604488372802734, + 
"learning_rate": 9.96705710102489e-07, + "loss": 1.0143, + "mean_token_accuracy": 0.6947677135467529, + "num_tokens": 70367561.0, + "step": 2724 + }, + { + "epoch": 0.2992532396222271, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.554697036743164, + "learning_rate": 9.970717423133236e-07, + "loss": 0.9468, + "mean_token_accuracy": 0.7095760107040405, + "num_tokens": 70389900.0, + "step": 2725 + }, + { + "epoch": 0.29936305732484075, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5216872692108154, + "learning_rate": 9.974377745241582e-07, + "loss": 0.9882, + "mean_token_accuracy": 0.706752598285675, + "num_tokens": 70415989.0, + "step": 2726 + }, + { + "epoch": 0.2994728750274544, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.115905284881592, + "learning_rate": 9.978038067349926e-07, + "loss": 1.0799, + "mean_token_accuracy": 0.6811959743499756, + "num_tokens": 70448872.0, + "step": 2727 + }, + { + "epoch": 0.2995826927300681, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.1624722480773926, + "learning_rate": 9.981698389458272e-07, + "loss": 1.0487, + "mean_token_accuracy": 0.6914931535720825, + "num_tokens": 70479507.0, + "step": 2728 + }, + { + "epoch": 0.29969251043268175, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.165863513946533, + "learning_rate": 9.985358711566618e-07, + "loss": 0.9945, + "mean_token_accuracy": 0.7001871466636658, + "num_tokens": 70510493.0, + "step": 2729 + }, + { + "epoch": 0.2998023281352954, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.691039562225342, + "learning_rate": 9.989019033674961e-07, + "loss": 0.9551, + "mean_token_accuracy": 0.7098547220230103, + "num_tokens": 70532115.0, + "step": 2730 + }, + { + "epoch": 0.2999121458379091, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.074699640274048, + "learning_rate": 9.99267935578331e-07, + "loss": 0.9278, + "mean_token_accuracy": 0.7096171975135803, + "num_tokens": 70563634.0, + "step": 2731 + }, + { + "epoch": 
0.30002196354052274, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.3051531314849854, + "learning_rate": 9.996339677891654e-07, + "loss": 1.016, + "mean_token_accuracy": 0.7057511806488037, + "num_tokens": 70591530.0, + "step": 2732 + }, + { + "epoch": 0.3001317812431364, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.7334482669830322, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6900436282157898, + "num_tokens": 70611898.0, + "step": 2733 + }, + { + "epoch": 0.30024159894575003, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.7221755981445312, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7193716168403625, + "num_tokens": 70631779.0, + "step": 2734 + }, + { + "epoch": 0.30035141664836373, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2213597297668457, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6960923671722412, + "num_tokens": 70660246.0, + "step": 2735 + }, + { + "epoch": 0.3004612343509774, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.219641923904419, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6896228790283203, + "num_tokens": 70691353.0, + "step": 2736 + }, + { + "epoch": 0.300571052053591, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.5979809761047363, + "learning_rate": 1e-06, + "loss": 1.11, + "mean_token_accuracy": 0.6728285551071167, + "num_tokens": 70714143.0, + "step": 2737 + }, + { + "epoch": 0.3006808697562047, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 2.4082958698272705, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6874195337295532, + "num_tokens": 70738976.0, + "step": 2738 + }, + { + "epoch": 0.3007906874588184, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.550999641418457, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7139720916748047, + "num_tokens": 70760448.0, + "step": 2739 + }, + { + "epoch": 0.300900505161432, + "ewc_loss": 
7.808208465576172e-06, + "grad_norm": 2.256608486175537, + "learning_rate": 1e-06, + "loss": 1.1125, + "mean_token_accuracy": 0.6686690449714661, + "num_tokens": 70786483.0, + "step": 2740 + }, + { + "epoch": 0.30101032286404567, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.4643659591674805, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7088272571563721, + "num_tokens": 70809208.0, + "step": 2741 + }, + { + "epoch": 0.30112014056665937, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.483973264694214, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7059485912322998, + "num_tokens": 70834608.0, + "step": 2742 + }, + { + "epoch": 0.301229958269273, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.2855334281921387, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7103252410888672, + "num_tokens": 70861261.0, + "step": 2743 + }, + { + "epoch": 0.30133977597188666, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.329132080078125, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.685614824295044, + "num_tokens": 70889551.0, + "step": 2744 + }, + { + "epoch": 0.3014495936745003, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.343883991241455, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6928523778915405, + "num_tokens": 70916162.0, + "step": 2745 + }, + { + "epoch": 0.301559411377114, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 2.4123988151550293, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7081537842750549, + "num_tokens": 70944163.0, + "step": 2746 + }, + { + "epoch": 0.30166922907972765, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.339367151260376, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7048267126083374, + "num_tokens": 70970646.0, + "step": 2747 + }, + { + "epoch": 0.3017790467823413, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 
2.1081461906433105, + "learning_rate": 1e-06, + "loss": 1.0961, + "mean_token_accuracy": 0.6770319938659668, + "num_tokens": 71003040.0, + "step": 2748 + }, + { + "epoch": 0.301888864484955, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.0916099548339844, + "learning_rate": 1e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.6803507208824158, + "num_tokens": 71034419.0, + "step": 2749 + }, + { + "epoch": 0.30199868218756865, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.479565143585205, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6926039457321167, + "num_tokens": 71060266.0, + "step": 2750 + }, + { + "epoch": 0.3021084998901823, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.4308626651763916, + "learning_rate": 1e-06, + "loss": 1.1322, + "mean_token_accuracy": 0.6648663282394409, + "num_tokens": 71085939.0, + "step": 2751 + }, + { + "epoch": 0.30221831759279594, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3777644634246826, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6922844648361206, + "num_tokens": 71110518.0, + "step": 2752 + }, + { + "epoch": 0.30232813529540964, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.5098719596862793, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.706856369972229, + "num_tokens": 71132356.0, + "step": 2753 + }, + { + "epoch": 0.3024379529980233, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.515136241912842, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6922429203987122, + "num_tokens": 71155990.0, + "step": 2754 + }, + { + "epoch": 0.30254777070063693, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.648190975189209, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.716310977935791, + "num_tokens": 71176187.0, + "step": 2755 + }, + { + "epoch": 0.3026575884032506, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.3982653617858887, + "learning_rate": 1e-06, + 
"loss": 1.0302, + "mean_token_accuracy": 0.6975661516189575, + "num_tokens": 71200764.0, + "step": 2756 + }, + { + "epoch": 0.3027674061058643, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.405104875564575, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6936323642730713, + "num_tokens": 71228226.0, + "step": 2757 + }, + { + "epoch": 0.3028772238084779, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.356799840927124, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6904512643814087, + "num_tokens": 71252338.0, + "step": 2758 + }, + { + "epoch": 0.30298704151109157, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.559640645980835, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.680895209312439, + "num_tokens": 71276029.0, + "step": 2759 + }, + { + "epoch": 0.30309685921370527, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.457212448120117, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7158236503601074, + "num_tokens": 71300798.0, + "step": 2760 + }, + { + "epoch": 0.3032066769163189, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.1470654010772705, + "learning_rate": 1e-06, + "loss": 1.1156, + "mean_token_accuracy": 0.6707578897476196, + "num_tokens": 71335051.0, + "step": 2761 + }, + { + "epoch": 0.30331649461893256, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.4617481231689453, + "learning_rate": 1e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6889674663543701, + "num_tokens": 71361661.0, + "step": 2762 + }, + { + "epoch": 0.3034263123215462, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.5179333686828613, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6996970176696777, + "num_tokens": 71382572.0, + "step": 2763 + }, + { + "epoch": 0.3035361300241599, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 2.5780694484710693, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 
0.7141619920730591, + "num_tokens": 71403425.0, + "step": 2764 + }, + { + "epoch": 0.30364594772677356, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.3850669860839844, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6935915350914001, + "num_tokens": 71428908.0, + "step": 2765 + }, + { + "epoch": 0.3037557654293872, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.3095703125, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.7148895263671875, + "num_tokens": 71454772.0, + "step": 2766 + }, + { + "epoch": 0.3038655831320009, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.3923540115356445, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6838604211807251, + "num_tokens": 71482693.0, + "step": 2767 + }, + { + "epoch": 0.30397540083461455, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.55185866355896, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7147769927978516, + "num_tokens": 71504932.0, + "step": 2768 + }, + { + "epoch": 0.3040852185372282, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.9219586849212646, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7072773575782776, + "num_tokens": 71524792.0, + "step": 2769 + }, + { + "epoch": 0.30419503623984184, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.2553822994232178, + "learning_rate": 1e-06, + "loss": 1.15, + "mean_token_accuracy": 0.6620413661003113, + "num_tokens": 71555938.0, + "step": 2770 + }, + { + "epoch": 0.30430485394245554, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 3.172804355621338, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6845623850822449, + "num_tokens": 71582278.0, + "step": 2771 + }, + { + "epoch": 0.3044146716450692, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.4067978858947754, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6932412385940552, + "num_tokens": 71611249.0, + 
"step": 2772 + }, + { + "epoch": 0.30452448934768284, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.347367525100708, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6949079036712646, + "num_tokens": 71640985.0, + "step": 2773 + }, + { + "epoch": 0.3046343070502965, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.607740640640259, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7044499516487122, + "num_tokens": 71663987.0, + "step": 2774 + }, + { + "epoch": 0.3047441247529102, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.635343313217163, + "learning_rate": 1e-06, + "loss": 1.0986, + "mean_token_accuracy": 0.6796450614929199, + "num_tokens": 71688019.0, + "step": 2775 + }, + { + "epoch": 0.30485394245552383, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3371164798736572, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7143573760986328, + "num_tokens": 71715223.0, + "step": 2776 + }, + { + "epoch": 0.3049637601581375, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3594584465026855, + "learning_rate": 1e-06, + "loss": 1.0743, + "mean_token_accuracy": 0.6781258583068848, + "num_tokens": 71742935.0, + "step": 2777 + }, + { + "epoch": 0.3050735778607512, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.2465169429779053, + "learning_rate": 1e-06, + "loss": 1.1488, + "mean_token_accuracy": 0.6673073768615723, + "num_tokens": 71774364.0, + "step": 2778 + }, + { + "epoch": 0.3051833955633648, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.1201298236846924, + "learning_rate": 1e-06, + "loss": 1.1497, + "mean_token_accuracy": 0.6677258610725403, + "num_tokens": 71808027.0, + "step": 2779 + }, + { + "epoch": 0.30529321326597847, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.5243852138519287, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7083378434181213, + "num_tokens": 71830722.0, + "step": 2780 + }, + { + "epoch": 
0.3054030309685921, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 2.5216574668884277, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6997604370117188, + "num_tokens": 71852928.0, + "step": 2781 + }, + { + "epoch": 0.3055128486712058, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.185309410095215, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6914175152778625, + "num_tokens": 71882433.0, + "step": 2782 + }, + { + "epoch": 0.30562266637381946, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.244515895843506, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6856828331947327, + "num_tokens": 71914594.0, + "step": 2783 + }, + { + "epoch": 0.3057324840764331, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.5141682624816895, + "learning_rate": 1e-06, + "loss": 1.0819, + "mean_token_accuracy": 0.6815661191940308, + "num_tokens": 71938171.0, + "step": 2784 + }, + { + "epoch": 0.30584230177904675, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3429386615753174, + "learning_rate": 1e-06, + "loss": 1.1076, + "mean_token_accuracy": 0.6893171668052673, + "num_tokens": 71966619.0, + "step": 2785 + }, + { + "epoch": 0.30595211948166046, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.2975850105285645, + "learning_rate": 1e-06, + "loss": 1.1312, + "mean_token_accuracy": 0.6660072207450867, + "num_tokens": 71996236.0, + "step": 2786 + }, + { + "epoch": 0.3060619371842741, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.547412157058716, + "learning_rate": 1e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.6750134229660034, + "num_tokens": 72020186.0, + "step": 2787 + }, + { + "epoch": 0.30617175488688775, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3005449771881104, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7046567797660828, + "num_tokens": 72045722.0, + "step": 2788 + }, + { + "epoch": 0.30628157258950145, + "ewc_loss": 
8.046627044677734e-06, + "grad_norm": 2.4351963996887207, + "learning_rate": 1e-06, + "loss": 1.1253, + "mean_token_accuracy": 0.6725583076477051, + "num_tokens": 72072531.0, + "step": 2789 + }, + { + "epoch": 0.3063913902921151, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.3907878398895264, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6960699558258057, + "num_tokens": 72098928.0, + "step": 2790 + }, + { + "epoch": 0.30650120799472874, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.4819984436035156, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6857553720474243, + "num_tokens": 72123697.0, + "step": 2791 + }, + { + "epoch": 0.3066110256973424, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.767570972442627, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7034355998039246, + "num_tokens": 72144223.0, + "step": 2792 + }, + { + "epoch": 0.3067208433999561, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.4769585132598877, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7061859369277954, + "num_tokens": 72167845.0, + "step": 2793 + }, + { + "epoch": 0.30683066110256974, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.588243007659912, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6978141069412231, + "num_tokens": 72190607.0, + "step": 2794 + }, + { + "epoch": 0.3069404788051834, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.4630191326141357, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6898903846740723, + "num_tokens": 72216993.0, + "step": 2795 + }, + { + "epoch": 0.3070502965077971, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 2.5024595260620117, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7060377597808838, + "num_tokens": 72240619.0, + "step": 2796 + }, + { + "epoch": 0.30716011421041073, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 
2.6703152656555176, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.68739914894104, + "num_tokens": 72261970.0, + "step": 2797 + }, + { + "epoch": 0.3072699319130244, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.6597487926483154, + "learning_rate": 1e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.6795283555984497, + "num_tokens": 72286977.0, + "step": 2798 + }, + { + "epoch": 0.307379749615638, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.298067331314087, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6893054246902466, + "num_tokens": 72312612.0, + "step": 2799 + }, + { + "epoch": 0.3074895673182517, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.1715219020843506, + "learning_rate": 1e-06, + "loss": 1.0933, + "mean_token_accuracy": 0.6827894449234009, + "num_tokens": 72342397.0, + "step": 2800 + }, + { + "epoch": 0.30759938502086537, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.140159845352173, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6843140125274658, + "num_tokens": 72373431.0, + "step": 2801 + }, + { + "epoch": 0.307709202723479, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.4142396450042725, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6936756372451782, + "num_tokens": 72399898.0, + "step": 2802 + }, + { + "epoch": 0.30781902042609266, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.429417133331299, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7515127658843994, + "num_tokens": 72421490.0, + "step": 2803 + }, + { + "epoch": 0.30792883812870636, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.691885232925415, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7152706980705261, + "num_tokens": 72441399.0, + "step": 2804 + }, + { + "epoch": 0.30803865583132, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.584951877593994, + "learning_rate": 1e-06, + "loss": 
0.9539, + "mean_token_accuracy": 0.7187328338623047, + "num_tokens": 72465359.0, + "step": 2805 + }, + { + "epoch": 0.30814847353393365, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.326448678970337, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7067586183547974, + "num_tokens": 72492832.0, + "step": 2806 + }, + { + "epoch": 0.30825829123654735, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.5720081329345703, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6903819441795349, + "num_tokens": 72517740.0, + "step": 2807 + }, + { + "epoch": 0.308368108939161, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.223907232284546, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7173308730125427, + "num_tokens": 72546634.0, + "step": 2808 + }, + { + "epoch": 0.30847792664177465, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.3306422233581543, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6946592330932617, + "num_tokens": 72574884.0, + "step": 2809 + }, + { + "epoch": 0.3085877443443883, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.4293465614318848, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.711838960647583, + "num_tokens": 72599568.0, + "step": 2810 + }, + { + "epoch": 0.308697562047002, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.492427110671997, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7053411602973938, + "num_tokens": 72623881.0, + "step": 2811 + }, + { + "epoch": 0.30880737974961564, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.0830793380737305, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6964139342308044, + "num_tokens": 72656182.0, + "step": 2812 + }, + { + "epoch": 0.3089171974522293, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.3521242141723633, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.714803159236908, 
+ "num_tokens": 72681467.0, + "step": 2813 + }, + { + "epoch": 0.309027015154843, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.650010585784912, + "learning_rate": 1e-06, + "loss": 1.1092, + "mean_token_accuracy": 0.6836464405059814, + "num_tokens": 72704525.0, + "step": 2814 + }, + { + "epoch": 0.30913683285745663, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.406951904296875, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6861499547958374, + "num_tokens": 72729940.0, + "step": 2815 + }, + { + "epoch": 0.3092466505600703, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.841557502746582, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7086685299873352, + "num_tokens": 72754018.0, + "step": 2816 + }, + { + "epoch": 0.3093564682626839, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.394803762435913, + "learning_rate": 1e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.6837747097015381, + "num_tokens": 72780723.0, + "step": 2817 + }, + { + "epoch": 0.3094662859652976, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5281736850738525, + "learning_rate": 1e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.6791425943374634, + "num_tokens": 72809617.0, + "step": 2818 + }, + { + "epoch": 0.3095761036679113, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.743041753768921, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7361130118370056, + "num_tokens": 72829097.0, + "step": 2819 + }, + { + "epoch": 0.3096859213705249, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.312335252761841, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7054760456085205, + "num_tokens": 72856357.0, + "step": 2820 + }, + { + "epoch": 0.30979573907313857, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.3824355602264404, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.707321047782898, + "num_tokens": 72882233.0, + "step": 2821 + }, + { + 
"epoch": 0.30990555677575227, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.520449161529541, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7091397047042847, + "num_tokens": 72905153.0, + "step": 2822 + }, + { + "epoch": 0.3100153744783659, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 2.757840633392334, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7167925834655762, + "num_tokens": 72926296.0, + "step": 2823 + }, + { + "epoch": 0.31012519218097956, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.24761962890625, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6906330585479736, + "num_tokens": 72955950.0, + "step": 2824 + }, + { + "epoch": 0.31023500988359326, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4541819095611572, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6797074675559998, + "num_tokens": 72982794.0, + "step": 2825 + }, + { + "epoch": 0.3103448275862069, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3731915950775146, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6981318593025208, + "num_tokens": 73007331.0, + "step": 2826 + }, + { + "epoch": 0.31045464528882055, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.636099338531494, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6936159729957581, + "num_tokens": 73027103.0, + "step": 2827 + }, + { + "epoch": 0.3105644629914342, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5120465755462646, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.694696307182312, + "num_tokens": 73049063.0, + "step": 2828 + }, + { + "epoch": 0.3106742806940479, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.3436384201049805, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6944074630737305, + "num_tokens": 73078869.0, + "step": 2829 + }, + { + "epoch": 0.31078409839666155, + "ewc_loss": 
8.165836334228516e-06, + "grad_norm": 2.4773170948028564, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6980828642845154, + "num_tokens": 73105191.0, + "step": 2830 + }, + { + "epoch": 0.3108939160992752, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.334587812423706, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6886050701141357, + "num_tokens": 73131768.0, + "step": 2831 + }, + { + "epoch": 0.31100373380188884, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.4053940773010254, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6967437267303467, + "num_tokens": 73158912.0, + "step": 2832 + }, + { + "epoch": 0.31111355150450254, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.501401901245117, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6999773979187012, + "num_tokens": 73183322.0, + "step": 2833 + }, + { + "epoch": 0.3112233692071162, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.2668371200561523, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.697772204875946, + "num_tokens": 73212338.0, + "step": 2834 + }, + { + "epoch": 0.31133318690972983, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.908820152282715, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.689430832862854, + "num_tokens": 73241951.0, + "step": 2835 + }, + { + "epoch": 0.31144300461234353, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.37312912940979, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7164344787597656, + "num_tokens": 73269042.0, + "step": 2836 + }, + { + "epoch": 0.3115528223149572, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.363100528717041, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6999068260192871, + "num_tokens": 73295104.0, + "step": 2837 + }, + { + "epoch": 0.3116626400175708, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 
2.2973644733428955, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6825650930404663, + "num_tokens": 73322207.0, + "step": 2838 + }, + { + "epoch": 0.31177245772018447, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.55946683883667, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6905996799468994, + "num_tokens": 73344542.0, + "step": 2839 + }, + { + "epoch": 0.31188227542279817, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.5793027877807617, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7027155160903931, + "num_tokens": 73366839.0, + "step": 2840 + }, + { + "epoch": 0.3119920931254118, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.526637077331543, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6963914632797241, + "num_tokens": 73390568.0, + "step": 2841 + }, + { + "epoch": 0.31210191082802546, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 2.612504720687866, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7113587856292725, + "num_tokens": 73411547.0, + "step": 2842 + }, + { + "epoch": 0.31221172853063917, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4900317192077637, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6950693726539612, + "num_tokens": 73436034.0, + "step": 2843 + }, + { + "epoch": 0.3123215462332528, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.353790283203125, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6945846676826477, + "num_tokens": 73461319.0, + "step": 2844 + }, + { + "epoch": 0.31243136393586646, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.5795650482177734, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6927705407142639, + "num_tokens": 73483607.0, + "step": 2845 + }, + { + "epoch": 0.3125411816384801, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3457658290863037, + "learning_rate": 1e-06, + 
"loss": 1.0556, + "mean_token_accuracy": 0.6799507141113281, + "num_tokens": 73510148.0, + "step": 2846 + }, + { + "epoch": 0.3126509993410938, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 3.0048060417175293, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7356820106506348, + "num_tokens": 73527606.0, + "step": 2847 + }, + { + "epoch": 0.31276081704370745, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.6438350677490234, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6998152136802673, + "num_tokens": 73548554.0, + "step": 2848 + }, + { + "epoch": 0.3128706347463211, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.188354969024658, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7246071100234985, + "num_tokens": 73575796.0, + "step": 2849 + }, + { + "epoch": 0.31298045244893474, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.263256788253784, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6985839605331421, + "num_tokens": 73605313.0, + "step": 2850 + }, + { + "epoch": 0.31309027015154844, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4129035472869873, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6880702376365662, + "num_tokens": 73630136.0, + "step": 2851 + }, + { + "epoch": 0.3132000878541621, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.629504919052124, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.712783932685852, + "num_tokens": 73650701.0, + "step": 2852 + }, + { + "epoch": 0.31330990555677574, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4254226684570312, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7178599238395691, + "num_tokens": 73675451.0, + "step": 2853 + }, + { + "epoch": 0.31341972325938944, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.741716146469116, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 
0.6914502382278442, + "num_tokens": 73699631.0, + "step": 2854 + }, + { + "epoch": 0.3135295409620031, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.1170566082000732, + "learning_rate": 1e-06, + "loss": 1.1656, + "mean_token_accuracy": 0.6673489212989807, + "num_tokens": 73734841.0, + "step": 2855 + }, + { + "epoch": 0.31363935866461673, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.17785382270813, + "learning_rate": 1e-06, + "loss": 1.0842, + "mean_token_accuracy": 0.6764534711837769, + "num_tokens": 73766927.0, + "step": 2856 + }, + { + "epoch": 0.3137491763672304, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2393767833709717, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6956208348274231, + "num_tokens": 73794957.0, + "step": 2857 + }, + { + "epoch": 0.3138589940698441, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.7859044075012207, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6986940503120422, + "num_tokens": 73814952.0, + "step": 2858 + }, + { + "epoch": 0.3139688117724577, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3129727840423584, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6910282969474792, + "num_tokens": 73840972.0, + "step": 2859 + }, + { + "epoch": 0.31407862947507137, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.156170129776001, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6814638376235962, + "num_tokens": 73871707.0, + "step": 2860 + }, + { + "epoch": 0.314188447177685, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4557971954345703, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7054715156555176, + "num_tokens": 73896564.0, + "step": 2861 + }, + { + "epoch": 0.3142982648802987, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.3544230461120605, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7029905319213867, + "num_tokens": 73923668.0, + 
"step": 2862 + }, + { + "epoch": 0.31440808258291236, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.312002182006836, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6862639784812927, + "num_tokens": 73949813.0, + "step": 2863 + }, + { + "epoch": 0.314517900285526, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.599149227142334, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6876726150512695, + "num_tokens": 73972394.0, + "step": 2864 + }, + { + "epoch": 0.3146277179881397, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.4448747634887695, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.697006106376648, + "num_tokens": 73996907.0, + "step": 2865 + }, + { + "epoch": 0.31473753569075336, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.302262306213379, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7237675786018372, + "num_tokens": 74023279.0, + "step": 2866 + }, + { + "epoch": 0.314847353393367, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.197722911834717, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.697650671005249, + "num_tokens": 74055139.0, + "step": 2867 + }, + { + "epoch": 0.31495717109598065, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.6566390991210938, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7035571336746216, + "num_tokens": 74077381.0, + "step": 2868 + }, + { + "epoch": 0.31506698879859435, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.451870918273926, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6954787969589233, + "num_tokens": 74102507.0, + "step": 2869 + }, + { + "epoch": 0.315176806501208, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.2821574211120605, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6813795566558838, + "num_tokens": 74131402.0, + "step": 2870 + }, + { + "epoch": 0.31528662420382164, 
+ "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.765820026397705, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7137877941131592, + "num_tokens": 74150490.0, + "step": 2871 + }, + { + "epoch": 0.31539644190643534, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.33113956451416, + "learning_rate": 1e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.6789982914924622, + "num_tokens": 74177232.0, + "step": 2872 + }, + { + "epoch": 0.315506259609049, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 2.492436647415161, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7159005403518677, + "num_tokens": 74201663.0, + "step": 2873 + }, + { + "epoch": 0.31561607731166264, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.4352927207946777, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7006694674491882, + "num_tokens": 74225721.0, + "step": 2874 + }, + { + "epoch": 0.3157258950142763, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.34297776222229, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7015154957771301, + "num_tokens": 74254833.0, + "step": 2875 + }, + { + "epoch": 0.31583571271689, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.398141384124756, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.696472704410553, + "num_tokens": 74280086.0, + "step": 2876 + }, + { + "epoch": 0.31594553041950363, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.317941188812256, + "learning_rate": 1e-06, + "loss": 1.1463, + "mean_token_accuracy": 0.6790428757667542, + "num_tokens": 74308202.0, + "step": 2877 + }, + { + "epoch": 0.3160553481221173, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.058736801147461, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6952859163284302, + "num_tokens": 74340124.0, + "step": 2878 + }, + { + "epoch": 0.3161651658247309, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 
2.618579149246216, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.703179121017456, + "num_tokens": 74363775.0, + "step": 2879 + }, + { + "epoch": 0.3162749835273446, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.5952320098876953, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7128989696502686, + "num_tokens": 74385059.0, + "step": 2880 + }, + { + "epoch": 0.31638480122995827, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1527957916259766, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6914750337600708, + "num_tokens": 74415274.0, + "step": 2881 + }, + { + "epoch": 0.3164946189325719, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.9484646320343018, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6963948607444763, + "num_tokens": 74433509.0, + "step": 2882 + }, + { + "epoch": 0.3166044366351856, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3942224979400635, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.7081265449523926, + "num_tokens": 74456649.0, + "step": 2883 + }, + { + "epoch": 0.31671425433779926, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.684530735015869, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6955621242523193, + "num_tokens": 74478535.0, + "step": 2884 + }, + { + "epoch": 0.3168240720404129, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.403574228286743, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7098777294158936, + "num_tokens": 74501778.0, + "step": 2885 + }, + { + "epoch": 0.31693388974302655, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.329843759536743, + "learning_rate": 1e-06, + "loss": 1.1117, + "mean_token_accuracy": 0.674546480178833, + "num_tokens": 74530204.0, + "step": 2886 + }, + { + "epoch": 0.31704370744564025, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.4613118171691895, + "learning_rate": 1e-06, + 
"loss": 0.8525, + "mean_token_accuracy": 0.7370203733444214, + "num_tokens": 74552726.0, + "step": 2887 + }, + { + "epoch": 0.3171535251482539, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.517256736755371, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.7034965753555298, + "num_tokens": 74574343.0, + "step": 2888 + }, + { + "epoch": 0.31726334285086755, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.496634006500244, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7049190998077393, + "num_tokens": 74597489.0, + "step": 2889 + }, + { + "epoch": 0.31737316055348125, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.233403444290161, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6981818675994873, + "num_tokens": 74630000.0, + "step": 2890 + }, + { + "epoch": 0.3174829782560949, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.365281343460083, + "learning_rate": 1e-06, + "loss": 1.1192, + "mean_token_accuracy": 0.676595151424408, + "num_tokens": 74656838.0, + "step": 2891 + }, + { + "epoch": 0.31759279595870854, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.368570327758789, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6917049884796143, + "num_tokens": 74680459.0, + "step": 2892 + }, + { + "epoch": 0.3177026136613222, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.167048454284668, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7066658139228821, + "num_tokens": 74709266.0, + "step": 2893 + }, + { + "epoch": 0.3178124313639359, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.120403528213501, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.714616596698761, + "num_tokens": 74737699.0, + "step": 2894 + }, + { + "epoch": 0.31792224906654953, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.5154097080230713, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 
0.7058830857276917, + "num_tokens": 74761079.0, + "step": 2895 + }, + { + "epoch": 0.3180320667691632, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.195122480392456, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7005796432495117, + "num_tokens": 74790938.0, + "step": 2896 + }, + { + "epoch": 0.3181418844717768, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.1212551593780518, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7071218490600586, + "num_tokens": 74819771.0, + "step": 2897 + }, + { + "epoch": 0.3182517021743905, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.4318907260894775, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.697019100189209, + "num_tokens": 74845767.0, + "step": 2898 + }, + { + "epoch": 0.3183615198770042, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.2543773651123047, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.705557644367218, + "num_tokens": 74873917.0, + "step": 2899 + }, + { + "epoch": 0.3184713375796178, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.417234182357788, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7150694727897644, + "num_tokens": 74898683.0, + "step": 2900 + }, + { + "epoch": 0.3185811552822315, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.367384433746338, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6907403469085693, + "num_tokens": 74923942.0, + "step": 2901 + }, + { + "epoch": 0.31869097298484517, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.261474847793579, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7093232274055481, + "num_tokens": 74950771.0, + "step": 2902 + }, + { + "epoch": 0.3188007906874588, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.6107757091522217, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6951020956039429, + "num_tokens": 74973253.0, + 
"step": 2903 + }, + { + "epoch": 0.31891060839007246, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 2.3735222816467285, + "learning_rate": 1e-06, + "loss": 1.1015, + "mean_token_accuracy": 0.6786326169967651, + "num_tokens": 74999470.0, + "step": 2904 + }, + { + "epoch": 0.31902042609268616, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.7098731994628906, + "learning_rate": 1e-06, + "loss": 1.0658, + "mean_token_accuracy": 0.6869394779205322, + "num_tokens": 75019191.0, + "step": 2905 + }, + { + "epoch": 0.3191302437952998, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.2687761783599854, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6844267845153809, + "num_tokens": 75047634.0, + "step": 2906 + }, + { + "epoch": 0.31924006149791345, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.240077257156372, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7036590576171875, + "num_tokens": 75077206.0, + "step": 2907 + }, + { + "epoch": 0.3193498792005271, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.4883196353912354, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6979990005493164, + "num_tokens": 75103531.0, + "step": 2908 + }, + { + "epoch": 0.3194596969031408, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.160581350326538, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7125489711761475, + "num_tokens": 75135218.0, + "step": 2909 + }, + { + "epoch": 0.31956951460575445, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.3680500984191895, + "learning_rate": 1e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.6710894107818604, + "num_tokens": 75162189.0, + "step": 2910 + }, + { + "epoch": 0.3196793323083681, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.591618061065674, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7250703573226929, + "num_tokens": 75183522.0, + "step": 2911 + }, + { + "epoch": 
0.3197891500109818, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.2342262268066406, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.7033294439315796, + "num_tokens": 75211860.0, + "step": 2912 + }, + { + "epoch": 0.31989896771359544, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.030405044555664, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6830877661705017, + "num_tokens": 75250506.0, + "step": 2913 + }, + { + "epoch": 0.3200087854162091, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.102177858352661, + "learning_rate": 1e-06, + "loss": 1.1321, + "mean_token_accuracy": 0.6734223961830139, + "num_tokens": 75285783.0, + "step": 2914 + }, + { + "epoch": 0.32011860311882273, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.119398593902588, + "learning_rate": 1e-06, + "loss": 1.1639, + "mean_token_accuracy": 0.6556441783905029, + "num_tokens": 75318059.0, + "step": 2915 + }, + { + "epoch": 0.32022842082143643, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.344229221343994, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6939183473587036, + "num_tokens": 75345039.0, + "step": 2916 + }, + { + "epoch": 0.3203382385240501, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.1920580863952637, + "learning_rate": 1e-06, + "loss": 1.1117, + "mean_token_accuracy": 0.6817795038223267, + "num_tokens": 75374543.0, + "step": 2917 + }, + { + "epoch": 0.3204480562266637, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.612835645675659, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.7014809846878052, + "num_tokens": 75397039.0, + "step": 2918 + }, + { + "epoch": 0.3205578739292774, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.418334722518921, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6896119117736816, + "num_tokens": 75422381.0, + "step": 2919 + }, + { + "epoch": 0.32066769163189107, + "ewc_loss": 
8.404254913330078e-06, + "grad_norm": 2.6697311401367188, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7202539443969727, + "num_tokens": 75446255.0, + "step": 2920 + }, + { + "epoch": 0.3207775093345047, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.529557228088379, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7198126316070557, + "num_tokens": 75468639.0, + "step": 2921 + }, + { + "epoch": 0.32088732703711836, + "ewc_loss": 8.404254913330078e-06, + "grad_norm": 2.2812657356262207, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7200131416320801, + "num_tokens": 75492994.0, + "step": 2922 + }, + { + "epoch": 0.32099714473973207, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.2485969066619873, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6901741027832031, + "num_tokens": 75520922.0, + "step": 2923 + }, + { + "epoch": 0.3211069624423457, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.207885980606079, + "learning_rate": 1e-06, + "loss": 1.091, + "mean_token_accuracy": 0.6878988146781921, + "num_tokens": 75548894.0, + "step": 2924 + }, + { + "epoch": 0.32121678014495936, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.678379535675049, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6920092105865479, + "num_tokens": 75569737.0, + "step": 2925 + }, + { + "epoch": 0.321326597847573, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.311849355697632, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6997730731964111, + "num_tokens": 75594651.0, + "step": 2926 + }, + { + "epoch": 0.3214364155501867, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.3112590312957764, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6965739130973816, + "num_tokens": 75620646.0, + "step": 2927 + }, + { + "epoch": 0.32154623325280035, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 
2.4873745441436768, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7132763862609863, + "num_tokens": 75642082.0, + "step": 2928 + }, + { + "epoch": 0.321656050955414, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.1937990188598633, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6874978542327881, + "num_tokens": 75671471.0, + "step": 2929 + }, + { + "epoch": 0.3217658686580277, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.339427947998047, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6846017837524414, + "num_tokens": 75698969.0, + "step": 2930 + }, + { + "epoch": 0.32187568636064134, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 2.2580814361572266, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.72489994764328, + "num_tokens": 75726030.0, + "step": 2931 + }, + { + "epoch": 0.321985504063255, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.527677297592163, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7094168663024902, + "num_tokens": 75749806.0, + "step": 2932 + }, + { + "epoch": 0.32209532176586864, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.617661714553833, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.7007238864898682, + "num_tokens": 75774356.0, + "step": 2933 + }, + { + "epoch": 0.32220513946848234, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.3469903469085693, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6864002346992493, + "num_tokens": 75801477.0, + "step": 2934 + }, + { + "epoch": 0.322314957171096, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.3764379024505615, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7103644013404846, + "num_tokens": 75826278.0, + "step": 2935 + }, + { + "epoch": 0.32242477487370963, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.284536838531494, + "learning_rate": 1e-06, + "loss": 
1.0092, + "mean_token_accuracy": 0.6991108059883118, + "num_tokens": 75853384.0, + "step": 2936 + }, + { + "epoch": 0.3225345925763233, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.503483772277832, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7140388488769531, + "num_tokens": 75875598.0, + "step": 2937 + }, + { + "epoch": 0.322644410278937, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.374276638031006, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6991949081420898, + "num_tokens": 75901014.0, + "step": 2938 + }, + { + "epoch": 0.3227542279815506, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.3896610736846924, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7060166001319885, + "num_tokens": 75927081.0, + "step": 2939 + }, + { + "epoch": 0.32286404568416427, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.695035457611084, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7357882857322693, + "num_tokens": 75947874.0, + "step": 2940 + }, + { + "epoch": 0.32297386338677797, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.3399293422698975, + "learning_rate": 1e-06, + "loss": 1.1095, + "mean_token_accuracy": 0.6764894723892212, + "num_tokens": 75974473.0, + "step": 2941 + }, + { + "epoch": 0.3230836810893916, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.5783119201660156, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7034952044487, + "num_tokens": 75997171.0, + "step": 2942 + }, + { + "epoch": 0.32319349879200526, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.554246664047241, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.7037744522094727, + "num_tokens": 76020257.0, + "step": 2943 + }, + { + "epoch": 0.3233033164946189, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.4535603523254395, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7040534615516663, + 
"num_tokens": 76042908.0, + "step": 2944 + }, + { + "epoch": 0.3234131341972326, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.263197183609009, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.69768226146698, + "num_tokens": 76071996.0, + "step": 2945 + }, + { + "epoch": 0.32352295189984626, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.4392900466918945, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7216106653213501, + "num_tokens": 76095474.0, + "step": 2946 + }, + { + "epoch": 0.3236327696024599, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.5210397243499756, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.709665060043335, + "num_tokens": 76119182.0, + "step": 2947 + }, + { + "epoch": 0.3237425873050736, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.417005777359009, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7132693529129028, + "num_tokens": 76141518.0, + "step": 2948 + }, + { + "epoch": 0.32385240500768725, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.2680344581604004, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6969300508499146, + "num_tokens": 76166749.0, + "step": 2949 + }, + { + "epoch": 0.3239622227103009, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.3371410369873047, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7193103432655334, + "num_tokens": 76194031.0, + "step": 2950 + }, + { + "epoch": 0.32407204041291454, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.359597682952881, + "learning_rate": 1e-06, + "loss": 1.1506, + "mean_token_accuracy": 0.6709374189376831, + "num_tokens": 76220946.0, + "step": 2951 + }, + { + "epoch": 0.32418185811552824, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.271559953689575, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6996405720710754, + "num_tokens": 76247852.0, + "step": 2952 + }, + { + 
"epoch": 0.3242916758181419, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.7739696502685547, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7441188097000122, + "num_tokens": 76265952.0, + "step": 2953 + }, + { + "epoch": 0.32440149352075554, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.277190923690796, + "learning_rate": 1e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.6861143112182617, + "num_tokens": 76295479.0, + "step": 2954 + }, + { + "epoch": 0.3245113112233692, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 2.444709300994873, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7121042013168335, + "num_tokens": 76319832.0, + "step": 2955 + }, + { + "epoch": 0.3246211289259829, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.606438398361206, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6978156566619873, + "num_tokens": 76346144.0, + "step": 2956 + }, + { + "epoch": 0.32473094662859653, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.3187217712402344, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7135207056999207, + "num_tokens": 76373994.0, + "step": 2957 + }, + { + "epoch": 0.3248407643312102, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.342634677886963, + "learning_rate": 1e-06, + "loss": 1.1372, + "mean_token_accuracy": 0.6662388443946838, + "num_tokens": 76404158.0, + "step": 2958 + }, + { + "epoch": 0.3249505820338239, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.017461061477661, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.688866376876831, + "num_tokens": 76440125.0, + "step": 2959 + }, + { + "epoch": 0.3250603997364375, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.6125564575195312, + "learning_rate": 1e-06, + "loss": 1.0842, + "mean_token_accuracy": 0.6808855533599854, + "num_tokens": 76464590.0, + "step": 2960 + }, + { + "epoch": 0.32517021743905117, + "ewc_loss": 
8.58306884765625e-06, + "grad_norm": 2.4293296337127686, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6842507719993591, + "num_tokens": 76488566.0, + "step": 2961 + }, + { + "epoch": 0.3252800351416648, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.302055835723877, + "learning_rate": 1e-06, + "loss": 1.1283, + "mean_token_accuracy": 0.683167576789856, + "num_tokens": 76517572.0, + "step": 2962 + }, + { + "epoch": 0.3253898528442785, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.555640697479248, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6894834041595459, + "num_tokens": 76539776.0, + "step": 2963 + }, + { + "epoch": 0.32549967054689216, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.2286877632141113, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7182698249816895, + "num_tokens": 76566605.0, + "step": 2964 + }, + { + "epoch": 0.3256094882495058, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.3637897968292236, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7090638875961304, + "num_tokens": 76591486.0, + "step": 2965 + }, + { + "epoch": 0.3257193059521195, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.611630439758301, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7045375108718872, + "num_tokens": 76611866.0, + "step": 2966 + }, + { + "epoch": 0.32582912365473315, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.404637336730957, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6972348690032959, + "num_tokens": 76637126.0, + "step": 2967 + }, + { + "epoch": 0.3259389413573468, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.5404932498931885, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6978231072425842, + "num_tokens": 76659032.0, + "step": 2968 + }, + { + "epoch": 0.32604875905996045, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.6035172939300537, + 
"learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7022184133529663, + "num_tokens": 76681382.0, + "step": 2969 + }, + { + "epoch": 0.32615857676257415, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.5436694622039795, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7015448212623596, + "num_tokens": 76703872.0, + "step": 2970 + }, + { + "epoch": 0.3262683944651878, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.242594003677368, + "learning_rate": 1e-06, + "loss": 1.137, + "mean_token_accuracy": 0.6672831177711487, + "num_tokens": 76732208.0, + "step": 2971 + }, + { + "epoch": 0.32637821216780144, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.2365944385528564, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.685828685760498, + "num_tokens": 76761485.0, + "step": 2972 + }, + { + "epoch": 0.3264880298704151, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.2494702339172363, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6988507509231567, + "num_tokens": 76790799.0, + "step": 2973 + }, + { + "epoch": 0.3265978475730288, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.5489704608917236, + "learning_rate": 1e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6919349431991577, + "num_tokens": 76814140.0, + "step": 2974 + }, + { + "epoch": 0.32670766527564243, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.5508131980895996, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6923937201499939, + "num_tokens": 76836439.0, + "step": 2975 + }, + { + "epoch": 0.3268174829782561, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.578498601913452, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.704201340675354, + "num_tokens": 76857722.0, + "step": 2976 + }, + { + "epoch": 0.3269273006808698, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.45365834236145, + "learning_rate": 1e-06, + "loss": 1.1601, + 
"mean_token_accuracy": 0.6651996374130249, + "num_tokens": 76883370.0, + "step": 2977 + }, + { + "epoch": 0.3270371183834834, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.5574748516082764, + "learning_rate": 1e-06, + "loss": 1.0769, + "mean_token_accuracy": 0.6766926050186157, + "num_tokens": 76907397.0, + "step": 2978 + }, + { + "epoch": 0.3271469360860971, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.49733304977417, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.695501446723938, + "num_tokens": 76931225.0, + "step": 2979 + }, + { + "epoch": 0.3272567537887107, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.4144883155822754, + "learning_rate": 1e-06, + "loss": 1.1098, + "mean_token_accuracy": 0.6687714457511902, + "num_tokens": 76956296.0, + "step": 2980 + }, + { + "epoch": 0.3273665714913244, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.3683481216430664, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7159908413887024, + "num_tokens": 76979668.0, + "step": 2981 + }, + { + "epoch": 0.32747638919393807, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.788032054901123, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6994805335998535, + "num_tokens": 76999757.0, + "step": 2982 + }, + { + "epoch": 0.3275862068965517, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.026855230331421, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7097853422164917, + "num_tokens": 77031664.0, + "step": 2983 + }, + { + "epoch": 0.32769602459916536, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.8037919998168945, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6963224411010742, + "num_tokens": 77050012.0, + "step": 2984 + }, + { + "epoch": 0.32780584230177906, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.6031224727630615, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6948580741882324, + "num_tokens": 
77071442.0, + "step": 2985 + }, + { + "epoch": 0.3279156600043927, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.401555299758911, + "learning_rate": 1e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.6899044513702393, + "num_tokens": 77098835.0, + "step": 2986 + }, + { + "epoch": 0.32802547770700635, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.2612688541412354, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7115812301635742, + "num_tokens": 77125012.0, + "step": 2987 + }, + { + "epoch": 0.32813529540962005, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 32.34727478027344, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7250598669052124, + "num_tokens": 77148158.0, + "step": 2988 + }, + { + "epoch": 0.3282451131122337, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.460718870162964, + "learning_rate": 1e-06, + "loss": 1.0871, + "mean_token_accuracy": 0.6801236867904663, + "num_tokens": 77175883.0, + "step": 2989 + }, + { + "epoch": 0.32835493081484735, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.6329269409179688, + "learning_rate": 1e-06, + "loss": 1.1323, + "mean_token_accuracy": 0.6703384518623352, + "num_tokens": 77200164.0, + "step": 2990 + }, + { + "epoch": 0.328464748517461, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.4061126708984375, + "learning_rate": 1e-06, + "loss": 1.1167, + "mean_token_accuracy": 0.6763765811920166, + "num_tokens": 77226914.0, + "step": 2991 + }, + { + "epoch": 0.3285745662200747, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.389356851577759, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6950377225875854, + "num_tokens": 77253262.0, + "step": 2992 + }, + { + "epoch": 0.32868438392268834, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.384869337081909, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7301162481307983, + "num_tokens": 77280602.0, + "step": 2993 + }, + { + "epoch": 
0.328794201625302, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.196169853210449, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7009159326553345, + "num_tokens": 77308632.0, + "step": 2994 + }, + { + "epoch": 0.3289040193279157, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.568206787109375, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6943866014480591, + "num_tokens": 77331211.0, + "step": 2995 + }, + { + "epoch": 0.32901383703052933, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.3229353427886963, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7158424854278564, + "num_tokens": 77358266.0, + "step": 2996 + }, + { + "epoch": 0.329123654733143, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.491748571395874, + "learning_rate": 1e-06, + "loss": 1.1112, + "mean_token_accuracy": 0.6862716674804688, + "num_tokens": 77382026.0, + "step": 2997 + }, + { + "epoch": 0.3292334724357566, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.094552516937256, + "learning_rate": 1e-06, + "loss": 1.1465, + "mean_token_accuracy": 0.672444224357605, + "num_tokens": 77420022.0, + "step": 2998 + }, + { + "epoch": 0.3293432901383703, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.2638540267944336, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6898224353790283, + "num_tokens": 77451856.0, + "step": 2999 + }, + { + "epoch": 0.32945310784098397, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.260490894317627, + "learning_rate": 1e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6814484596252441, + "num_tokens": 77481883.0, + "step": 3000 + }, + { + "epoch": 0.3295629255435976, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.702439546585083, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.717500627040863, + "num_tokens": 77501260.0, + "step": 3001 + }, + { + "epoch": 0.32967274324621126, + "ewc_loss": 8.58306884765625e-06, + 
"grad_norm": 2.583786964416504, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7034134268760681, + "num_tokens": 77522736.0, + "step": 3002 + }, + { + "epoch": 0.32978256094882497, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.309028387069702, + "learning_rate": 1e-06, + "loss": 1.1506, + "mean_token_accuracy": 0.6595946550369263, + "num_tokens": 77552652.0, + "step": 3003 + }, + { + "epoch": 0.3298923786514386, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.295339584350586, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7255032062530518, + "num_tokens": 77577360.0, + "step": 3004 + }, + { + "epoch": 0.33000219635405226, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.145307779312134, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7094792723655701, + "num_tokens": 77606752.0, + "step": 3005 + }, + { + "epoch": 0.33011201405666596, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.5569660663604736, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7083727717399597, + "num_tokens": 77628709.0, + "step": 3006 + }, + { + "epoch": 0.3302218317592796, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.2634782791137695, + "learning_rate": 1e-06, + "loss": 1.0799, + "mean_token_accuracy": 0.6807090044021606, + "num_tokens": 77656252.0, + "step": 3007 + }, + { + "epoch": 0.33033164946189325, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.427279472351074, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6994594931602478, + "num_tokens": 77681961.0, + "step": 3008 + }, + { + "epoch": 0.3304414671645069, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.3451855182647705, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6995673179626465, + "num_tokens": 77709226.0, + "step": 3009 + }, + { + "epoch": 0.3305512848671206, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.8496532440185547, + "learning_rate": 
1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7162744998931885, + "num_tokens": 77726517.0, + "step": 3010 + }, + { + "epoch": 0.33066110256973424, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.341155767440796, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6993274688720703, + "num_tokens": 77752521.0, + "step": 3011 + }, + { + "epoch": 0.3307709202723479, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.2787258625030518, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6966159343719482, + "num_tokens": 77779017.0, + "step": 3012 + }, + { + "epoch": 0.33088073797496154, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.4510412216186523, + "learning_rate": 1e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6778364181518555, + "num_tokens": 77803179.0, + "step": 3013 + }, + { + "epoch": 0.33099055567757524, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.2246336936950684, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7142369747161865, + "num_tokens": 77830529.0, + "step": 3014 + }, + { + "epoch": 0.3311003733801889, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.422899007797241, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6961988806724548, + "num_tokens": 77857041.0, + "step": 3015 + }, + { + "epoch": 0.33121019108280253, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.550333023071289, + "learning_rate": 1e-06, + "loss": 1.1044, + "mean_token_accuracy": 0.6800967454910278, + "num_tokens": 77881178.0, + "step": 3016 + }, + { + "epoch": 0.33132000878541623, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.3308868408203125, + "learning_rate": 1e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.681995153427124, + "num_tokens": 77910440.0, + "step": 3017 + }, + { + "epoch": 0.3314298264880299, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.238328456878662, + "learning_rate": 1e-06, + "loss": 1.1078, + "mean_token_accuracy": 
0.6760222911834717, + "num_tokens": 77941588.0, + "step": 3018 + }, + { + "epoch": 0.3315396441906435, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.6219077110290527, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7134650945663452, + "num_tokens": 77962746.0, + "step": 3019 + }, + { + "epoch": 0.33164946189325717, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.6804683208465576, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6855409145355225, + "num_tokens": 77985228.0, + "step": 3020 + }, + { + "epoch": 0.33175927959587087, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.371927499771118, + "learning_rate": 1e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.6740906238555908, + "num_tokens": 78015834.0, + "step": 3021 + }, + { + "epoch": 0.3318690972984845, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.592414140701294, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.7000389099121094, + "num_tokens": 78038463.0, + "step": 3022 + }, + { + "epoch": 0.33197891500109816, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.255666494369507, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6945761442184448, + "num_tokens": 78068067.0, + "step": 3023 + }, + { + "epoch": 0.33208873270371186, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.3863160610198975, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6978263258934021, + "num_tokens": 78099716.0, + "step": 3024 + }, + { + "epoch": 0.3321985504063255, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.3910841941833496, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7277899980545044, + "num_tokens": 78123803.0, + "step": 3025 + }, + { + "epoch": 0.33230836810893916, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.440829038619995, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7081616520881653, + "num_tokens": 78148900.0, + 
"step": 3026 + }, + { + "epoch": 0.3324181858115528, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.6825430393218994, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7020115852355957, + "num_tokens": 78172082.0, + "step": 3027 + }, + { + "epoch": 0.3325280035141665, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.36187481880188, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7137514352798462, + "num_tokens": 78197140.0, + "step": 3028 + }, + { + "epoch": 0.33263782121678015, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.128917694091797, + "learning_rate": 1e-06, + "loss": 1.1014, + "mean_token_accuracy": 0.6761205792427063, + "num_tokens": 78229344.0, + "step": 3029 + }, + { + "epoch": 0.3327476389193938, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.3363120555877686, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6952904462814331, + "num_tokens": 78257350.0, + "step": 3030 + }, + { + "epoch": 0.33285745662200744, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.276529550552368, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7083570957183838, + "num_tokens": 78286663.0, + "step": 3031 + }, + { + "epoch": 0.33296727432462114, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.1981418132781982, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.687078058719635, + "num_tokens": 78318532.0, + "step": 3032 + }, + { + "epoch": 0.3330770920272348, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.461688280105591, + "learning_rate": 1e-06, + "loss": 1.1177, + "mean_token_accuracy": 0.6709591150283813, + "num_tokens": 78342676.0, + "step": 3033 + }, + { + "epoch": 0.33318690972984844, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.5112202167510986, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.70295250415802, + "num_tokens": 78364760.0, + "step": 3034 + }, + { + "epoch": 0.33329672743246214, + 
"ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.464989423751831, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6952642798423767, + "num_tokens": 78388601.0, + "step": 3035 + }, + { + "epoch": 0.3334065451350758, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.6673858165740967, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.694969117641449, + "num_tokens": 78415549.0, + "step": 3036 + }, + { + "epoch": 0.33351636283768943, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.3382484912872314, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7044101357460022, + "num_tokens": 78442685.0, + "step": 3037 + }, + { + "epoch": 0.3336261805403031, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.3724093437194824, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6962442398071289, + "num_tokens": 78469472.0, + "step": 3038 + }, + { + "epoch": 0.3337359982429168, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.140888214111328, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7124699354171753, + "num_tokens": 78501465.0, + "step": 3039 + }, + { + "epoch": 0.3338458159455304, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.4026732444763184, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7167883515357971, + "num_tokens": 78525782.0, + "step": 3040 + }, + { + "epoch": 0.33395563364814407, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.4139466285705566, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6983937621116638, + "num_tokens": 78550247.0, + "step": 3041 + }, + { + "epoch": 0.33406545135075777, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.261690139770508, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7044696807861328, + "num_tokens": 78576229.0, + "step": 3042 + }, + { + "epoch": 0.3341752690533714, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 
2.2515764236450195, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6948840618133545, + "num_tokens": 78603794.0, + "step": 3043 + }, + { + "epoch": 0.33428508675598506, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.554924964904785, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7007368803024292, + "num_tokens": 78624317.0, + "step": 3044 + }, + { + "epoch": 0.3343949044585987, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.3944947719573975, + "learning_rate": 1e-06, + "loss": 1.078, + "mean_token_accuracy": 0.681306004524231, + "num_tokens": 78649693.0, + "step": 3045 + }, + { + "epoch": 0.3345047221612124, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.2201101779937744, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7262824177742004, + "num_tokens": 78676792.0, + "step": 3046 + }, + { + "epoch": 0.33461453986382605, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.5492427349090576, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7125256061553955, + "num_tokens": 78698812.0, + "step": 3047 + }, + { + "epoch": 0.3347243575664397, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.547938346862793, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6966478824615479, + "num_tokens": 78723823.0, + "step": 3048 + }, + { + "epoch": 0.33483417526905335, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.2937304973602295, + "learning_rate": 1e-06, + "loss": 1.0709, + "mean_token_accuracy": 0.6883600950241089, + "num_tokens": 78751359.0, + "step": 3049 + }, + { + "epoch": 0.33494399297166705, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.2376551628112793, + "learning_rate": 1e-06, + "loss": 1.1243, + "mean_token_accuracy": 0.6694768667221069, + "num_tokens": 78778325.0, + "step": 3050 + }, + { + "epoch": 0.3350538106742807, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.59260630607605, + "learning_rate": 1e-06, + 
"loss": 0.9269, + "mean_token_accuracy": 0.7210583686828613, + "num_tokens": 78800096.0, + "step": 3051 + }, + { + "epoch": 0.33516362837689434, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.758152723312378, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7025072574615479, + "num_tokens": 78820036.0, + "step": 3052 + }, + { + "epoch": 0.33527344607950804, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.295719623565674, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6958441138267517, + "num_tokens": 78845369.0, + "step": 3053 + }, + { + "epoch": 0.3353832637821217, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.2658350467681885, + "learning_rate": 1e-06, + "loss": 1.1193, + "mean_token_accuracy": 0.6740327477455139, + "num_tokens": 78878373.0, + "step": 3054 + }, + { + "epoch": 0.33549308148473533, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.5293726921081543, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6948933601379395, + "num_tokens": 78900627.0, + "step": 3055 + }, + { + "epoch": 0.335602899187349, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.1697773933410645, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6915802955627441, + "num_tokens": 78932358.0, + "step": 3056 + }, + { + "epoch": 0.3357127168899627, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.5005645751953125, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6990864872932434, + "num_tokens": 78954929.0, + "step": 3057 + }, + { + "epoch": 0.3358225345925763, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.11643648147583, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7128601670265198, + "num_tokens": 78984236.0, + "step": 3058 + }, + { + "epoch": 0.33593235229519, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.214057207107544, + "learning_rate": 1e-06, + "loss": 1.0652, + "mean_token_accuracy": 
0.6941692233085632, + "num_tokens": 79013737.0, + "step": 3059 + }, + { + "epoch": 0.3360421699978036, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.111095905303955, + "learning_rate": 1e-06, + "loss": 1.0937, + "mean_token_accuracy": 0.6788477301597595, + "num_tokens": 79047783.0, + "step": 3060 + }, + { + "epoch": 0.3361519877004173, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.5857417583465576, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6994221806526184, + "num_tokens": 79069512.0, + "step": 3061 + }, + { + "epoch": 0.33626180540303097, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.2129783630371094, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7214305400848389, + "num_tokens": 79098817.0, + "step": 3062 + }, + { + "epoch": 0.3363716231056446, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.6515262126922607, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6849035620689392, + "num_tokens": 79119523.0, + "step": 3063 + }, + { + "epoch": 0.3364814408082583, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.2600884437561035, + "learning_rate": 1e-06, + "loss": 1.1164, + "mean_token_accuracy": 0.674584686756134, + "num_tokens": 79148617.0, + "step": 3064 + }, + { + "epoch": 0.33659125851087196, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.378277063369751, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7024935483932495, + "num_tokens": 79174100.0, + "step": 3065 + }, + { + "epoch": 0.3367010762134856, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.360818386077881, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.708257794380188, + "num_tokens": 79200804.0, + "step": 3066 + }, + { + "epoch": 0.33681089391609925, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 2.51060152053833, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7033665180206299, + "num_tokens": 79223988.0, + 
"step": 3067 + }, + { + "epoch": 0.33692071161871295, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.4590413570404053, + "learning_rate": 1e-06, + "loss": 1.1064, + "mean_token_accuracy": 0.6780437231063843, + "num_tokens": 79248404.0, + "step": 3068 + }, + { + "epoch": 0.3370305293213266, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.2029638290405273, + "learning_rate": 1e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.683827817440033, + "num_tokens": 79277072.0, + "step": 3069 + }, + { + "epoch": 0.33714034702394025, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.3152904510498047, + "learning_rate": 1e-06, + "loss": 1.1226, + "mean_token_accuracy": 0.6718670129776001, + "num_tokens": 79304757.0, + "step": 3070 + }, + { + "epoch": 0.33725016472655395, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.0274181365966797, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6880829334259033, + "num_tokens": 79339350.0, + "step": 3071 + }, + { + "epoch": 0.3373599824291676, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.276005506515503, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6811155080795288, + "num_tokens": 79369818.0, + "step": 3072 + }, + { + "epoch": 0.33746980013178124, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.1551601886749268, + "learning_rate": 1e-06, + "loss": 1.1094, + "mean_token_accuracy": 0.6812498569488525, + "num_tokens": 79401881.0, + "step": 3073 + }, + { + "epoch": 0.3375796178343949, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.641467809677124, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7025416493415833, + "num_tokens": 79420457.0, + "step": 3074 + }, + { + "epoch": 0.3376894355370086, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.2928881645202637, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6979837417602539, + "num_tokens": 79450330.0, + "step": 3075 + }, + { + "epoch": 
0.33779925323962223, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.3254222869873047, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.695449948310852, + "num_tokens": 79477626.0, + "step": 3076 + }, + { + "epoch": 0.3379090709422359, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 2.5109975337982178, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6938396692276001, + "num_tokens": 79500703.0, + "step": 3077 + }, + { + "epoch": 0.3380188886448495, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.4085044860839844, + "learning_rate": 1e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6846994161605835, + "num_tokens": 79526708.0, + "step": 3078 + }, + { + "epoch": 0.3381287063474632, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.047062397003174, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7002630233764648, + "num_tokens": 79561806.0, + "step": 3079 + }, + { + "epoch": 0.33823852405007687, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.398280382156372, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7044366598129272, + "num_tokens": 79587764.0, + "step": 3080 + }, + { + "epoch": 0.3383483417526905, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.1996562480926514, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.710614025592804, + "num_tokens": 79616144.0, + "step": 3081 + }, + { + "epoch": 0.3384581594553042, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.46513032913208, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7058779001235962, + "num_tokens": 79638796.0, + "step": 3082 + }, + { + "epoch": 0.33856797715791787, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.398087739944458, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7178139686584473, + "num_tokens": 79662566.0, + "step": 3083 + }, + { + "epoch": 0.3386777948605315, + "ewc_loss": 
8.821487426757812e-06, + "grad_norm": 2.3038558959960938, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7053583264350891, + "num_tokens": 79691734.0, + "step": 3084 + }, + { + "epoch": 0.33878761256314516, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.2960193157196045, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6993764042854309, + "num_tokens": 79720234.0, + "step": 3085 + }, + { + "epoch": 0.33889743026575886, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.391463279724121, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6898127794265747, + "num_tokens": 79744274.0, + "step": 3086 + }, + { + "epoch": 0.3390072479683725, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.5074222087860107, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6947657465934753, + "num_tokens": 79771274.0, + "step": 3087 + }, + { + "epoch": 0.33911706567098615, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.413588762283325, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6828027963638306, + "num_tokens": 79797808.0, + "step": 3088 + }, + { + "epoch": 0.3392268833735998, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.5183370113372803, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7035754919052124, + "num_tokens": 79818490.0, + "step": 3089 + }, + { + "epoch": 0.3393367010762135, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.4262285232543945, + "learning_rate": 1e-06, + "loss": 1.0889, + "mean_token_accuracy": 0.6783777475357056, + "num_tokens": 79846912.0, + "step": 3090 + }, + { + "epoch": 0.33944651877882714, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 3.0617098808288574, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7160819172859192, + "num_tokens": 79862416.0, + "step": 3091 + }, + { + "epoch": 0.3395563364814408, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 
2.3884432315826416, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6854654550552368, + "num_tokens": 79888186.0, + "step": 3092 + }, + { + "epoch": 0.3396661541840545, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.302518129348755, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6899541616439819, + "num_tokens": 79914312.0, + "step": 3093 + }, + { + "epoch": 0.33977597188666814, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 2.4551315307617188, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7070513367652893, + "num_tokens": 79938231.0, + "step": 3094 + }, + { + "epoch": 0.3398857895892818, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.1739954948425293, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7142348289489746, + "num_tokens": 79965742.0, + "step": 3095 + }, + { + "epoch": 0.33999560729189543, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.123020887374878, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6994000673294067, + "num_tokens": 80000024.0, + "step": 3096 + }, + { + "epoch": 0.34010542499450913, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.0991904735565186, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6981183290481567, + "num_tokens": 80032041.0, + "step": 3097 + }, + { + "epoch": 0.3402152426971228, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.3341050148010254, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6848214268684387, + "num_tokens": 80057633.0, + "step": 3098 + }, + { + "epoch": 0.3403250603997364, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.34637188911438, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6958058476448059, + "num_tokens": 80083175.0, + "step": 3099 + }, + { + "epoch": 0.3404348781023501, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.2785117626190186, + "learning_rate": 1e-06, + 
"loss": 1.1101, + "mean_token_accuracy": 0.679787278175354, + "num_tokens": 80112222.0, + "step": 3100 + }, + { + "epoch": 0.34054469580496377, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.9558801651000977, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7049394845962524, + "num_tokens": 80129502.0, + "step": 3101 + }, + { + "epoch": 0.3406545135075774, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.5334043502807617, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7121919989585876, + "num_tokens": 80152719.0, + "step": 3102 + }, + { + "epoch": 0.34076433121019106, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.3994486331939697, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7125494480133057, + "num_tokens": 80177076.0, + "step": 3103 + }, + { + "epoch": 0.34087414891280476, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.2626891136169434, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6954662799835205, + "num_tokens": 80204022.0, + "step": 3104 + }, + { + "epoch": 0.3409839666154184, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.3847882747650146, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7102022171020508, + "num_tokens": 80229727.0, + "step": 3105 + }, + { + "epoch": 0.34109378431803206, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.4120266437530518, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7179015874862671, + "num_tokens": 80251886.0, + "step": 3106 + }, + { + "epoch": 0.3412036020206457, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.1538264751434326, + "learning_rate": 1e-06, + "loss": 1.0835, + "mean_token_accuracy": 0.6805291175842285, + "num_tokens": 80283712.0, + "step": 3107 + }, + { + "epoch": 0.3413134197232594, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.315890312194824, + "learning_rate": 1e-06, + "loss": 1.1064, + "mean_token_accuracy": 
0.6820006370544434, + "num_tokens": 80311758.0, + "step": 3108 + }, + { + "epoch": 0.34142323742587305, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.7083098888397217, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6946664452552795, + "num_tokens": 80333423.0, + "step": 3109 + }, + { + "epoch": 0.3415330551284867, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 2.4717421531677246, + "learning_rate": 1e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6816858053207397, + "num_tokens": 80358027.0, + "step": 3110 + }, + { + "epoch": 0.3416428728311004, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.2095353603363037, + "learning_rate": 1e-06, + "loss": 1.0909, + "mean_token_accuracy": 0.6965354681015015, + "num_tokens": 80385582.0, + "step": 3111 + }, + { + "epoch": 0.34175269053371404, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.2631516456604004, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.714957594871521, + "num_tokens": 80415533.0, + "step": 3112 + }, + { + "epoch": 0.3418625082363277, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.4460980892181396, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7151967883110046, + "num_tokens": 80439223.0, + "step": 3113 + }, + { + "epoch": 0.34197232593894134, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.6461598873138428, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7003395557403564, + "num_tokens": 80462023.0, + "step": 3114 + }, + { + "epoch": 0.34208214364155504, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.5707027912139893, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6948234438896179, + "num_tokens": 80484919.0, + "step": 3115 + }, + { + "epoch": 0.3421919613441687, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.2971811294555664, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.721915602684021, + "num_tokens": 
80508170.0, + "step": 3116 + }, + { + "epoch": 0.34230177904678233, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.287745237350464, + "learning_rate": 1e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6862508654594421, + "num_tokens": 80539222.0, + "step": 3117 + }, + { + "epoch": 0.34241159674939603, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 4.440463066101074, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7109105587005615, + "num_tokens": 80562836.0, + "step": 3118 + }, + { + "epoch": 0.3425214144520097, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.4224038124084473, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6997863054275513, + "num_tokens": 80588438.0, + "step": 3119 + }, + { + "epoch": 0.3426312321546233, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.414515256881714, + "learning_rate": 1e-06, + "loss": 1.105, + "mean_token_accuracy": 0.6795390844345093, + "num_tokens": 80615939.0, + "step": 3120 + }, + { + "epoch": 0.34274104985723697, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.3249950408935547, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6887739300727844, + "num_tokens": 80642335.0, + "step": 3121 + }, + { + "epoch": 0.34285086755985067, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.4589874744415283, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6913806200027466, + "num_tokens": 80665781.0, + "step": 3122 + }, + { + "epoch": 0.3429606852624643, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.107842445373535, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.696574330329895, + "num_tokens": 80696047.0, + "step": 3123 + }, + { + "epoch": 0.34307050296507796, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.5827667713165283, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.704720675945282, + "num_tokens": 80719122.0, + "step": 3124 + }, + { + "epoch": 
0.3431803206676916, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.1844325065612793, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6975316405296326, + "num_tokens": 80747066.0, + "step": 3125 + }, + { + "epoch": 0.3432901383703053, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.3330929279327393, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.7006101608276367, + "num_tokens": 80775529.0, + "step": 3126 + }, + { + "epoch": 0.34339995607291895, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.225768804550171, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6867858171463013, + "num_tokens": 80806422.0, + "step": 3127 + }, + { + "epoch": 0.3435097737755326, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.39751935005188, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6995342969894409, + "num_tokens": 80829206.0, + "step": 3128 + }, + { + "epoch": 0.3436195914781463, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.172224760055542, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6891758441925049, + "num_tokens": 80859251.0, + "step": 3129 + }, + { + "epoch": 0.34372940918075995, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.630559206008911, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7055259943008423, + "num_tokens": 80879585.0, + "step": 3130 + }, + { + "epoch": 0.3438392268833736, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.8833794593811035, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7153267860412598, + "num_tokens": 80896459.0, + "step": 3131 + }, + { + "epoch": 0.34394904458598724, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.6740517616271973, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7072331309318542, + "num_tokens": 80916349.0, + "step": 3132 + }, + { + "epoch": 0.34405886228860094, + "ewc_loss": 
8.940696716308594e-06, + "grad_norm": 2.538987636566162, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7127243280410767, + "num_tokens": 80940274.0, + "step": 3133 + }, + { + "epoch": 0.3441686799912146, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.320882797241211, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6947526931762695, + "num_tokens": 80967458.0, + "step": 3134 + }, + { + "epoch": 0.34427849769382823, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.6799044609069824, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7097629308700562, + "num_tokens": 80987311.0, + "step": 3135 + }, + { + "epoch": 0.3443883153964419, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.616776943206787, + "learning_rate": 1e-06, + "loss": 1.1494, + "mean_token_accuracy": 0.6641384363174438, + "num_tokens": 81009634.0, + "step": 3136 + }, + { + "epoch": 0.3444981330990556, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.7773187160491943, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7018826007843018, + "num_tokens": 81029324.0, + "step": 3137 + }, + { + "epoch": 0.3446079508016692, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.295621871948242, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6869965195655823, + "num_tokens": 81060117.0, + "step": 3138 + }, + { + "epoch": 0.3447177685042829, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.5225038528442383, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7039535045623779, + "num_tokens": 81084163.0, + "step": 3139 + }, + { + "epoch": 0.3448275862068966, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.2619898319244385, + "learning_rate": 1e-06, + "loss": 1.0896, + "mean_token_accuracy": 0.6826743483543396, + "num_tokens": 81115321.0, + "step": 3140 + }, + { + "epoch": 0.3449374039095102, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 
2.630403757095337, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7052448391914368, + "num_tokens": 81136307.0, + "step": 3141 + }, + { + "epoch": 0.34504722161212387, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.386632204055786, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6868963837623596, + "num_tokens": 81163479.0, + "step": 3142 + }, + { + "epoch": 0.3451570393147375, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.6779274940490723, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6858968734741211, + "num_tokens": 81188105.0, + "step": 3143 + }, + { + "epoch": 0.3452668570173512, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.359818696975708, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6983290910720825, + "num_tokens": 81217529.0, + "step": 3144 + }, + { + "epoch": 0.34537667471996486, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.1820666790008545, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7057240605354309, + "num_tokens": 81245442.0, + "step": 3145 + }, + { + "epoch": 0.3454864924225785, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.240750789642334, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7111929059028625, + "num_tokens": 81273374.0, + "step": 3146 + }, + { + "epoch": 0.3455963101251922, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.321369171142578, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7041676044464111, + "num_tokens": 81299150.0, + "step": 3147 + }, + { + "epoch": 0.34570612782780585, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.4745500087738037, + "learning_rate": 1e-06, + "loss": 1.0867, + "mean_token_accuracy": 0.6837316751480103, + "num_tokens": 81323114.0, + "step": 3148 + }, + { + "epoch": 0.3458159455304195, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.7772839069366455, + "learning_rate": 1e-06, + 
"loss": 0.9318, + "mean_token_accuracy": 0.7202337980270386, + "num_tokens": 81343642.0, + "step": 3149 + }, + { + "epoch": 0.34592576323303315, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 2.3482770919799805, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7140212059020996, + "num_tokens": 81371186.0, + "step": 3150 + }, + { + "epoch": 0.34603558093564685, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.571702241897583, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6991863250732422, + "num_tokens": 81398073.0, + "step": 3151 + }, + { + "epoch": 0.3461453986382605, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.2203197479248047, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7049428224563599, + "num_tokens": 81426255.0, + "step": 3152 + }, + { + "epoch": 0.34625521634087414, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.371473789215088, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6918966770172119, + "num_tokens": 81452329.0, + "step": 3153 + }, + { + "epoch": 0.3463650340434878, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.3974826335906982, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7254167795181274, + "num_tokens": 81475750.0, + "step": 3154 + }, + { + "epoch": 0.3464748517461015, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.790022611618042, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.697043776512146, + "num_tokens": 81499098.0, + "step": 3155 + }, + { + "epoch": 0.34658466944871513, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.3049418926239014, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6835493445396423, + "num_tokens": 81525567.0, + "step": 3156 + }, + { + "epoch": 0.3466944871513288, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.422335147857666, + "learning_rate": 1e-06, + "loss": 1.1027, + "mean_token_accuracy": 
0.6714122295379639, + "num_tokens": 81552757.0, + "step": 3157 + }, + { + "epoch": 0.3468043048539425, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.355466365814209, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6887389421463013, + "num_tokens": 81579060.0, + "step": 3158 + }, + { + "epoch": 0.3469141225565561, + "ewc_loss": 9.000301361083984e-06, + "grad_norm": 2.2477807998657227, + "learning_rate": 1e-06, + "loss": 1.0798, + "mean_token_accuracy": 0.6832886934280396, + "num_tokens": 81608542.0, + "step": 3159 + }, + { + "epoch": 0.34702394025916977, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.190717935562134, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7012151479721069, + "num_tokens": 81639879.0, + "step": 3160 + }, + { + "epoch": 0.3471337579617834, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.3194448947906494, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.713715672492981, + "num_tokens": 81664867.0, + "step": 3161 + }, + { + "epoch": 0.3472435756643971, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.532139301300049, + "learning_rate": 1e-06, + "loss": 1.1027, + "mean_token_accuracy": 0.6797343492507935, + "num_tokens": 81691172.0, + "step": 3162 + }, + { + "epoch": 0.34735339336701077, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.1012890338897705, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.7055968046188354, + "num_tokens": 81722600.0, + "step": 3163 + }, + { + "epoch": 0.3474632110696244, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 3.070366621017456, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7212729454040527, + "num_tokens": 81737797.0, + "step": 3164 + }, + { + "epoch": 0.34757302877223806, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.609851598739624, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7106611728668213, + "num_tokens": 81757397.0, + 
"step": 3165 + }, + { + "epoch": 0.34768284647485176, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.5790271759033203, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7159138917922974, + "num_tokens": 81778433.0, + "step": 3166 + }, + { + "epoch": 0.3477926641774654, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.228747606277466, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.691146731376648, + "num_tokens": 81808813.0, + "step": 3167 + }, + { + "epoch": 0.34790248188007905, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.336869478225708, + "learning_rate": 1e-06, + "loss": 1.0618, + "mean_token_accuracy": 0.6812396049499512, + "num_tokens": 81835433.0, + "step": 3168 + }, + { + "epoch": 0.34801229958269275, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.6583175659179688, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7065110206604004, + "num_tokens": 81855762.0, + "step": 3169 + }, + { + "epoch": 0.3481221172853064, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.461926221847534, + "learning_rate": 1e-06, + "loss": 1.1112, + "mean_token_accuracy": 0.670741081237793, + "num_tokens": 81880430.0, + "step": 3170 + }, + { + "epoch": 0.34823193498792004, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.4491991996765137, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7146524786949158, + "num_tokens": 81903500.0, + "step": 3171 + }, + { + "epoch": 0.3483417526905337, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.254042387008667, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7107153534889221, + "num_tokens": 81935057.0, + "step": 3172 + }, + { + "epoch": 0.3484515703931474, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.466108798980713, + "learning_rate": 1e-06, + "loss": 1.0934, + "mean_token_accuracy": 0.6749104857444763, + "num_tokens": 81959076.0, + "step": 3173 + }, + { + "epoch": 
0.34856138809576104, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.5437331199645996, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7068870067596436, + "num_tokens": 81981974.0, + "step": 3174 + }, + { + "epoch": 0.3486712057983747, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.0812461376190186, + "learning_rate": 1e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.6853300929069519, + "num_tokens": 82015685.0, + "step": 3175 + }, + { + "epoch": 0.3487810235009884, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.6899139881134033, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7059975862503052, + "num_tokens": 82037034.0, + "step": 3176 + }, + { + "epoch": 0.34889084120360203, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.4707133769989014, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.709151029586792, + "num_tokens": 82060036.0, + "step": 3177 + }, + { + "epoch": 0.3490006589062157, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.382735252380371, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6963204741477966, + "num_tokens": 82086924.0, + "step": 3178 + }, + { + "epoch": 0.3491104766088293, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.4620537757873535, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7045553922653198, + "num_tokens": 82110692.0, + "step": 3179 + }, + { + "epoch": 0.349220294311443, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.6201415061950684, + "learning_rate": 1e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6886829137802124, + "num_tokens": 82133124.0, + "step": 3180 + }, + { + "epoch": 0.34933011201405667, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.2273194789886475, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7328234314918518, + "num_tokens": 82160041.0, + "step": 3181 + }, + { + "epoch": 0.3494399297166703, + "ewc_loss": 
9.119510650634766e-06, + "grad_norm": 2.864842176437378, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7041663527488708, + "num_tokens": 82183128.0, + "step": 3182 + }, + { + "epoch": 0.34954974741928396, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.4700891971588135, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.7025750875473022, + "num_tokens": 82206128.0, + "step": 3183 + }, + { + "epoch": 0.34965956512189766, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.6485254764556885, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7035776376724243, + "num_tokens": 82227187.0, + "step": 3184 + }, + { + "epoch": 0.3497693828245113, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 2.4541213512420654, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7095891833305359, + "num_tokens": 82250941.0, + "step": 3185 + }, + { + "epoch": 0.34987920052712496, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 2.2635045051574707, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6922872066497803, + "num_tokens": 82279220.0, + "step": 3186 + }, + { + "epoch": 0.34998901822973866, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 2.2497336864471436, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.686346173286438, + "num_tokens": 82308794.0, + "step": 3187 + }, + { + "epoch": 0.3500988359323523, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 2.4443860054016113, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.7026388645172119, + "num_tokens": 82333167.0, + "step": 3188 + }, + { + "epoch": 0.35020865363496595, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 2.5429418087005615, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7150360345840454, + "num_tokens": 82356761.0, + "step": 3189 + }, + { + "epoch": 0.3503184713375796, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 
2.5399396419525146, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.7041598558425903, + "num_tokens": 82377746.0, + "step": 3190 + }, + { + "epoch": 0.3504282890401933, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.3950138092041016, + "learning_rate": 1e-06, + "loss": 1.144, + "mean_token_accuracy": 0.6695104837417603, + "num_tokens": 82404722.0, + "step": 3191 + }, + { + "epoch": 0.35053810674280694, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.3167176246643066, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7046523094177246, + "num_tokens": 82431008.0, + "step": 3192 + }, + { + "epoch": 0.3506479244454206, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.262749671936035, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6820963621139526, + "num_tokens": 82459447.0, + "step": 3193 + }, + { + "epoch": 0.3507577421480343, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.2851428985595703, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6883793473243713, + "num_tokens": 82485581.0, + "step": 3194 + }, + { + "epoch": 0.35086755985064794, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.216960906982422, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7029799818992615, + "num_tokens": 82512694.0, + "step": 3195 + }, + { + "epoch": 0.3509773775532616, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.367157220840454, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7146037817001343, + "num_tokens": 82537247.0, + "step": 3196 + }, + { + "epoch": 0.35108719525587523, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.5628678798675537, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7161219716072083, + "num_tokens": 82558570.0, + "step": 3197 + }, + { + "epoch": 0.35119701295848893, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.272005319595337, + "learning_rate": 1e-06, + 
"loss": 1.0387, + "mean_token_accuracy": 0.6915153861045837, + "num_tokens": 82587299.0, + "step": 3198 + }, + { + "epoch": 0.3513068306611026, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.076338768005371, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.691666841506958, + "num_tokens": 82619644.0, + "step": 3199 + }, + { + "epoch": 0.3514166483637162, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.45732045173645, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.705795407295227, + "num_tokens": 82644335.0, + "step": 3200 + }, + { + "epoch": 0.35152646606632987, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.1229326725006104, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6923545002937317, + "num_tokens": 82674506.0, + "step": 3201 + }, + { + "epoch": 0.35163628376894357, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.312232732772827, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6879464387893677, + "num_tokens": 82702472.0, + "step": 3202 + }, + { + "epoch": 0.3517461014715572, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.2713863849639893, + "learning_rate": 1e-06, + "loss": 1.0726, + "mean_token_accuracy": 0.678857684135437, + "num_tokens": 82730656.0, + "step": 3203 + }, + { + "epoch": 0.35185591917417086, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.216578245162964, + "learning_rate": 1e-06, + "loss": 1.115, + "mean_token_accuracy": 0.6724871397018433, + "num_tokens": 82761816.0, + "step": 3204 + }, + { + "epoch": 0.35196573687678456, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.4564828872680664, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.710929811000824, + "num_tokens": 82785818.0, + "step": 3205 + }, + { + "epoch": 0.3520755545793982, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.2996761798858643, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 
0.7014755606651306, + "num_tokens": 82813666.0, + "step": 3206 + }, + { + "epoch": 0.35218537228201185, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.378645181655884, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.7064274549484253, + "num_tokens": 82840296.0, + "step": 3207 + }, + { + "epoch": 0.3522951899846255, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 3.0316786766052246, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7030764222145081, + "num_tokens": 82857720.0, + "step": 3208 + }, + { + "epoch": 0.3524050076872392, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.5260276794433594, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6828338503837585, + "num_tokens": 82882989.0, + "step": 3209 + }, + { + "epoch": 0.35251482538985285, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.3378281593322754, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7249609231948853, + "num_tokens": 82908373.0, + "step": 3210 + }, + { + "epoch": 0.3526246430924665, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.1179006099700928, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7037779092788696, + "num_tokens": 82939675.0, + "step": 3211 + }, + { + "epoch": 0.35273446079508014, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.5913398265838623, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.7059270739555359, + "num_tokens": 82964508.0, + "step": 3212 + }, + { + "epoch": 0.35284427849769384, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.468552589416504, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7206226587295532, + "num_tokens": 82986542.0, + "step": 3213 + }, + { + "epoch": 0.3529540962003075, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.409423351287842, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.7016847729682922, + "num_tokens": 83010692.0, 
+ "step": 3214 + }, + { + "epoch": 0.35306391390292113, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.302105665206909, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7175278067588806, + "num_tokens": 83037047.0, + "step": 3215 + }, + { + "epoch": 0.35317373160553484, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.2774999141693115, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6987572908401489, + "num_tokens": 83063122.0, + "step": 3216 + }, + { + "epoch": 0.3532835493081485, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.319042921066284, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6957949995994568, + "num_tokens": 83091684.0, + "step": 3217 + }, + { + "epoch": 0.3533933670107621, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.303455352783203, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6962566375732422, + "num_tokens": 83119244.0, + "step": 3218 + }, + { + "epoch": 0.3535031847133758, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.6864821910858154, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.7076427936553955, + "num_tokens": 83141800.0, + "step": 3219 + }, + { + "epoch": 0.3536130024159895, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.3252158164978027, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6989454030990601, + "num_tokens": 83170081.0, + "step": 3220 + }, + { + "epoch": 0.3537228201186031, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.64099383354187, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7011113166809082, + "num_tokens": 83190689.0, + "step": 3221 + }, + { + "epoch": 0.35383263782121677, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.307910442352295, + "learning_rate": 1e-06, + "loss": 1.1008, + "mean_token_accuracy": 0.6769865155220032, + "num_tokens": 83223907.0, + "step": 3222 + }, + { + "epoch": 
0.35394245552383047, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.6697499752044678, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6964185237884521, + "num_tokens": 83245462.0, + "step": 3223 + }, + { + "epoch": 0.3540522732264441, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.6285805702209473, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7051622271537781, + "num_tokens": 83265293.0, + "step": 3224 + }, + { + "epoch": 0.35416209092905776, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.338428497314453, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7104296684265137, + "num_tokens": 83291068.0, + "step": 3225 + }, + { + "epoch": 0.3542719086316714, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.46939754486084, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6901905536651611, + "num_tokens": 83318510.0, + "step": 3226 + }, + { + "epoch": 0.3543817263342851, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.016281843185425, + "learning_rate": 1e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.678521454334259, + "num_tokens": 83353576.0, + "step": 3227 + }, + { + "epoch": 0.35449154403689875, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.495914936065674, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6929425597190857, + "num_tokens": 83376074.0, + "step": 3228 + }, + { + "epoch": 0.3546013617395124, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.299102783203125, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.699613094329834, + "num_tokens": 83401544.0, + "step": 3229 + }, + { + "epoch": 0.35471117944212605, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.1131253242492676, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6961206793785095, + "num_tokens": 83433777.0, + "step": 3230 + }, + { + "epoch": 0.35482099714473975, + "ewc_loss": 
9.179115295410156e-06, + "grad_norm": 2.3205902576446533, + "learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6892980337142944, + "num_tokens": 83459923.0, + "step": 3231 + }, + { + "epoch": 0.3549308148473534, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.3072168827056885, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.6771845817565918, + "num_tokens": 83487301.0, + "step": 3232 + }, + { + "epoch": 0.35504063254996704, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.653780221939087, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7093451619148254, + "num_tokens": 83507683.0, + "step": 3233 + }, + { + "epoch": 0.35515045025258074, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.47007417678833, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6822936534881592, + "num_tokens": 83533092.0, + "step": 3234 + }, + { + "epoch": 0.3552602679551944, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.3624777793884277, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.686279296875, + "num_tokens": 83562221.0, + "step": 3235 + }, + { + "epoch": 0.35537008565780803, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.3584794998168945, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7038084268569946, + "num_tokens": 83587927.0, + "step": 3236 + }, + { + "epoch": 0.3554799033604217, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.1037449836730957, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6941146850585938, + "num_tokens": 83619553.0, + "step": 3237 + }, + { + "epoch": 0.3555897210630354, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.0756471157073975, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7230082750320435, + "num_tokens": 83652859.0, + "step": 3238 + }, + { + "epoch": 0.355699538765649, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 
2.2450170516967773, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6912691593170166, + "num_tokens": 83679314.0, + "step": 3239 + }, + { + "epoch": 0.35580935646826267, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.2865662574768066, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6954994201660156, + "num_tokens": 83704588.0, + "step": 3240 + }, + { + "epoch": 0.3559191741708763, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.1321804523468018, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7022482752799988, + "num_tokens": 83732073.0, + "step": 3241 + }, + { + "epoch": 0.35602899187349, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.204641103744507, + "learning_rate": 1e-06, + "loss": 1.0837, + "mean_token_accuracy": 0.6793861389160156, + "num_tokens": 83761233.0, + "step": 3242 + }, + { + "epoch": 0.35613880957610367, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.4352216720581055, + "learning_rate": 1e-06, + "loss": 1.0904, + "mean_token_accuracy": 0.6740660667419434, + "num_tokens": 83789309.0, + "step": 3243 + }, + { + "epoch": 0.3562486272787173, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.2556164264678955, + "learning_rate": 1e-06, + "loss": 1.1021, + "mean_token_accuracy": 0.683262050151825, + "num_tokens": 83822253.0, + "step": 3244 + }, + { + "epoch": 0.356358444981331, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.153542995452881, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7053143382072449, + "num_tokens": 83850718.0, + "step": 3245 + }, + { + "epoch": 0.35646826268394466, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.4138805866241455, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6963824033737183, + "num_tokens": 83874658.0, + "step": 3246 + }, + { + "epoch": 0.3565780803865583, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.4789538383483887, + "learning_rate": 1e-06, + 
"loss": 0.9814, + "mean_token_accuracy": 0.721182107925415, + "num_tokens": 83899291.0, + "step": 3247 + }, + { + "epoch": 0.35668789808917195, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.3900246620178223, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7200886011123657, + "num_tokens": 83922590.0, + "step": 3248 + }, + { + "epoch": 0.35679771579178565, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.29764461517334, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6994056701660156, + "num_tokens": 83950687.0, + "step": 3249 + }, + { + "epoch": 0.3569075334943993, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.242595911026001, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6947674751281738, + "num_tokens": 83977564.0, + "step": 3250 + }, + { + "epoch": 0.35701735119701294, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.5437397956848145, + "learning_rate": 1e-06, + "loss": 1.0784, + "mean_token_accuracy": 0.6807182431221008, + "num_tokens": 84003184.0, + "step": 3251 + }, + { + "epoch": 0.35712716889962665, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.1511988639831543, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.6989978551864624, + "num_tokens": 84031308.0, + "step": 3252 + }, + { + "epoch": 0.3572369866022403, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.356250047683716, + "learning_rate": 1e-06, + "loss": 1.0882, + "mean_token_accuracy": 0.6749149560928345, + "num_tokens": 84057005.0, + "step": 3253 + }, + { + "epoch": 0.35734680430485394, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.302820920944214, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.6858018636703491, + "num_tokens": 84084931.0, + "step": 3254 + }, + { + "epoch": 0.3574566220074676, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.573795795440674, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 
0.6829918622970581, + "num_tokens": 84108380.0, + "step": 3255 + }, + { + "epoch": 0.3575664397100813, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 2.5452847480773926, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7202911376953125, + "num_tokens": 84130087.0, + "step": 3256 + }, + { + "epoch": 0.35767625741269493, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.3636581897735596, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7040113210678101, + "num_tokens": 84154892.0, + "step": 3257 + }, + { + "epoch": 0.3577860751153086, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.569263219833374, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7085787057876587, + "num_tokens": 84176496.0, + "step": 3258 + }, + { + "epoch": 0.3578958928179222, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.6025338172912598, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6861191391944885, + "num_tokens": 84200797.0, + "step": 3259 + }, + { + "epoch": 0.3580057105205359, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.41082763671875, + "learning_rate": 1e-06, + "loss": 1.1141, + "mean_token_accuracy": 0.6851485967636108, + "num_tokens": 84226155.0, + "step": 3260 + }, + { + "epoch": 0.35811552822314957, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.461303234100342, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7169928550720215, + "num_tokens": 84249673.0, + "step": 3261 + }, + { + "epoch": 0.3582253459257632, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.589768409729004, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6983437538146973, + "num_tokens": 84273724.0, + "step": 3262 + }, + { + "epoch": 0.3583351636283769, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.4687352180480957, + "learning_rate": 1e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6912973523139954, + "num_tokens": 84299628.0, + 
"step": 3263 + }, + { + "epoch": 0.35844498133099056, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.5019116401672363, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7103945016860962, + "num_tokens": 84327527.0, + "step": 3264 + }, + { + "epoch": 0.3585547990336042, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.46114182472229, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6881442070007324, + "num_tokens": 84354889.0, + "step": 3265 + }, + { + "epoch": 0.35866461673621786, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.438880681991577, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6959226131439209, + "num_tokens": 84380522.0, + "step": 3266 + }, + { + "epoch": 0.35877443443883156, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.400618076324463, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7110888957977295, + "num_tokens": 84405266.0, + "step": 3267 + }, + { + "epoch": 0.3588842521414452, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.5026936531066895, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7174261808395386, + "num_tokens": 84428101.0, + "step": 3268 + }, + { + "epoch": 0.35899406984405885, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.4612817764282227, + "learning_rate": 1e-06, + "loss": 1.1073, + "mean_token_accuracy": 0.6768816113471985, + "num_tokens": 84454829.0, + "step": 3269 + }, + { + "epoch": 0.35910388754667255, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.484736442565918, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7235745191574097, + "num_tokens": 84475796.0, + "step": 3270 + }, + { + "epoch": 0.3592137052492862, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.2662601470947266, + "learning_rate": 1e-06, + "loss": 1.0994, + "mean_token_accuracy": 0.6732326745986938, + "num_tokens": 84506712.0, + "step": 3271 + }, + { + "epoch": 
0.35932352295189984, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.6545042991638184, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7055484056472778, + "num_tokens": 84527613.0, + "step": 3272 + }, + { + "epoch": 0.3594333406545135, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.576056480407715, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6943373680114746, + "num_tokens": 84552379.0, + "step": 3273 + }, + { + "epoch": 0.3595431583571272, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.1470046043395996, + "learning_rate": 1e-06, + "loss": 1.0851, + "mean_token_accuracy": 0.6794766783714294, + "num_tokens": 84583957.0, + "step": 3274 + }, + { + "epoch": 0.35965297605974084, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.2427570819854736, + "learning_rate": 1e-06, + "loss": 1.1063, + "mean_token_accuracy": 0.6717402935028076, + "num_tokens": 84616061.0, + "step": 3275 + }, + { + "epoch": 0.3597627937623545, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.4445173740386963, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6927264332771301, + "num_tokens": 84639966.0, + "step": 3276 + }, + { + "epoch": 0.35987261146496813, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.1904423236846924, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6990283131599426, + "num_tokens": 84670805.0, + "step": 3277 + }, + { + "epoch": 0.35998242916758183, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.4160971641540527, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6955779790878296, + "num_tokens": 84695492.0, + "step": 3278 + }, + { + "epoch": 0.3600922468701955, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.4378976821899414, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.723063051700592, + "num_tokens": 84721170.0, + "step": 3279 + }, + { + "epoch": 0.3602020645728091, + "ewc_loss": 
9.238719940185547e-06, + "grad_norm": 2.3723130226135254, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.690385639667511, + "num_tokens": 84746589.0, + "step": 3280 + }, + { + "epoch": 0.3603118822754228, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.4660470485687256, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6943608522415161, + "num_tokens": 84770710.0, + "step": 3281 + }, + { + "epoch": 0.36042169997803647, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.047100067138672, + "learning_rate": 1e-06, + "loss": 1.1523, + "mean_token_accuracy": 0.6638959646224976, + "num_tokens": 84806101.0, + "step": 3282 + }, + { + "epoch": 0.3605315176806501, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.1934773921966553, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7095631957054138, + "num_tokens": 84835710.0, + "step": 3283 + }, + { + "epoch": 0.36064133538326376, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.371748208999634, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6921406984329224, + "num_tokens": 84863843.0, + "step": 3284 + }, + { + "epoch": 0.36075115308587746, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 2.555267810821533, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6860349178314209, + "num_tokens": 84886682.0, + "step": 3285 + }, + { + "epoch": 0.3608609707884911, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 7.293597221374512, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6937999129295349, + "num_tokens": 84907655.0, + "step": 3286 + }, + { + "epoch": 0.36097078849110475, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.241738796234131, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6924033164978027, + "num_tokens": 84936194.0, + "step": 3287 + }, + { + "epoch": 0.3610806061937184, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 
2.2514753341674805, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6854734420776367, + "num_tokens": 84964512.0, + "step": 3288 + }, + { + "epoch": 0.3611904238963321, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.3247742652893066, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6965221166610718, + "num_tokens": 84991502.0, + "step": 3289 + }, + { + "epoch": 0.36130024159894575, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.5398035049438477, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.6862713098526001, + "num_tokens": 85013129.0, + "step": 3290 + }, + { + "epoch": 0.3614100593015594, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.465822458267212, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7157919406890869, + "num_tokens": 85036620.0, + "step": 3291 + }, + { + "epoch": 0.3615198770041731, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.5254087448120117, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6927813291549683, + "num_tokens": 85060171.0, + "step": 3292 + }, + { + "epoch": 0.36162969470678674, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.359409809112549, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6846357583999634, + "num_tokens": 85086553.0, + "step": 3293 + }, + { + "epoch": 0.3617395124094004, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.126277208328247, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6917642951011658, + "num_tokens": 85118680.0, + "step": 3294 + }, + { + "epoch": 0.36184933011201403, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.47400164604187, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7116103768348694, + "num_tokens": 85142298.0, + "step": 3295 + }, + { + "epoch": 0.36195914781462774, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.2090864181518555, + "learning_rate": 1e-06, + 
"loss": 0.9739, + "mean_token_accuracy": 0.7095311880111694, + "num_tokens": 85170484.0, + "step": 3296 + }, + { + "epoch": 0.3620689655172414, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.376458168029785, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7175700664520264, + "num_tokens": 85196017.0, + "step": 3297 + }, + { + "epoch": 0.362178783219855, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.58874249458313, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7236005067825317, + "num_tokens": 85216845.0, + "step": 3298 + }, + { + "epoch": 0.36228860092246873, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.2865755558013916, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7114790678024292, + "num_tokens": 85243824.0, + "step": 3299 + }, + { + "epoch": 0.3623984186250824, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.2035512924194336, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.7038292288780212, + "num_tokens": 85274198.0, + "step": 3300 + }, + { + "epoch": 0.362508236327696, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.259580373764038, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6967798471450806, + "num_tokens": 85303239.0, + "step": 3301 + }, + { + "epoch": 0.36261805403030967, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.1952590942382812, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7061952352523804, + "num_tokens": 85330757.0, + "step": 3302 + }, + { + "epoch": 0.36272787173292337, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.3783905506134033, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7322844862937927, + "num_tokens": 85354065.0, + "step": 3303 + }, + { + "epoch": 0.362837689435537, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 2.3830227851867676, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 
0.7379138469696045, + "num_tokens": 85377501.0, + "step": 3304 + }, + { + "epoch": 0.36294750713815066, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 2.3401570320129395, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6926355361938477, + "num_tokens": 85405009.0, + "step": 3305 + }, + { + "epoch": 0.3630573248407643, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 2.732133150100708, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7225534319877625, + "num_tokens": 85425905.0, + "step": 3306 + }, + { + "epoch": 0.363167142543378, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 2.3340072631835938, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.706564724445343, + "num_tokens": 85452029.0, + "step": 3307 + }, + { + "epoch": 0.36327696024599165, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 2.436394214630127, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.7017489671707153, + "num_tokens": 85478947.0, + "step": 3308 + }, + { + "epoch": 0.3633867779486053, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 2.148306131362915, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6974676251411438, + "num_tokens": 85510887.0, + "step": 3309 + }, + { + "epoch": 0.363496595651219, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.265148639678955, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6923125386238098, + "num_tokens": 85539105.0, + "step": 3310 + }, + { + "epoch": 0.36360641335383265, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.634834051132202, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7158865928649902, + "num_tokens": 85561827.0, + "step": 3311 + }, + { + "epoch": 0.3637162310564463, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.3642871379852295, + "learning_rate": 1e-06, + "loss": 1.1177, + "mean_token_accuracy": 0.6707205772399902, + "num_tokens": 85594193.0, + 
"step": 3312 + }, + { + "epoch": 0.36382604875905994, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.4178502559661865, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6928896903991699, + "num_tokens": 85619805.0, + "step": 3313 + }, + { + "epoch": 0.36393586646167364, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.812500238418579, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7016751170158386, + "num_tokens": 85640275.0, + "step": 3314 + }, + { + "epoch": 0.3640456841642873, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.4405646324157715, + "learning_rate": 1e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.6879482269287109, + "num_tokens": 85665126.0, + "step": 3315 + }, + { + "epoch": 0.36415550186690093, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.5252773761749268, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.7015984058380127, + "num_tokens": 85689650.0, + "step": 3316 + }, + { + "epoch": 0.3642653195695146, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.122490644454956, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6930620670318604, + "num_tokens": 85720567.0, + "step": 3317 + }, + { + "epoch": 0.3643751372721283, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.4523465633392334, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6934043169021606, + "num_tokens": 85744555.0, + "step": 3318 + }, + { + "epoch": 0.3644849549747419, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.320549964904785, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6987176537513733, + "num_tokens": 85769197.0, + "step": 3319 + }, + { + "epoch": 0.36459477267735557, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.342210292816162, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7038196325302124, + "num_tokens": 85794927.0, + "step": 3320 + }, + { + "epoch": 
0.3647045903799693, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.257880926132202, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.703614354133606, + "num_tokens": 85820362.0, + "step": 3321 + }, + { + "epoch": 0.3648144080825829, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.5373246669769287, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7225097417831421, + "num_tokens": 85841839.0, + "step": 3322 + }, + { + "epoch": 0.36492422578519657, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.3720543384552, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6898068189620972, + "num_tokens": 85867377.0, + "step": 3323 + }, + { + "epoch": 0.3650340434878102, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.339895009994507, + "learning_rate": 1e-06, + "loss": 1.0774, + "mean_token_accuracy": 0.6813082098960876, + "num_tokens": 85894728.0, + "step": 3324 + }, + { + "epoch": 0.3651438611904239, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.296956777572632, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7130768895149231, + "num_tokens": 85920460.0, + "step": 3325 + }, + { + "epoch": 0.36525367889303756, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.4155967235565186, + "learning_rate": 1e-06, + "loss": 1.0658, + "mean_token_accuracy": 0.6877531409263611, + "num_tokens": 85947417.0, + "step": 3326 + }, + { + "epoch": 0.3653634965956512, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.2172815799713135, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6841214299201965, + "num_tokens": 85978475.0, + "step": 3327 + }, + { + "epoch": 0.3654733142982649, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.230555295944214, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6940010190010071, + "num_tokens": 86005268.0, + "step": 3328 + }, + { + "epoch": 0.36558313200087855, + "ewc_loss": 
9.417533874511719e-06, + "grad_norm": 2.2797932624816895, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7108778357505798, + "num_tokens": 86032447.0, + "step": 3329 + }, + { + "epoch": 0.3656929497034922, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.514979124069214, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.682277262210846, + "num_tokens": 86055965.0, + "step": 3330 + }, + { + "epoch": 0.36580276740610584, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.288635015487671, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6760327816009521, + "num_tokens": 86083407.0, + "step": 3331 + }, + { + "epoch": 0.36591258510871955, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.4063868522644043, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7061758637428284, + "num_tokens": 86107077.0, + "step": 3332 + }, + { + "epoch": 0.3660224028113332, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.5327370166778564, + "learning_rate": 1e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6886361837387085, + "num_tokens": 86131381.0, + "step": 3333 + }, + { + "epoch": 0.36613222051394684, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.5194432735443115, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6906343698501587, + "num_tokens": 86154122.0, + "step": 3334 + }, + { + "epoch": 0.3662420382165605, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.3453969955444336, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.692192018032074, + "num_tokens": 86180267.0, + "step": 3335 + }, + { + "epoch": 0.3663518559191742, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.0607750415802, + "learning_rate": 1e-06, + "loss": 1.102, + "mean_token_accuracy": 0.6777509450912476, + "num_tokens": 86215594.0, + "step": 3336 + }, + { + "epoch": 0.36646167362178783, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 
2.160449504852295, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6907541155815125, + "num_tokens": 86246071.0, + "step": 3337 + }, + { + "epoch": 0.3665714913244015, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.218593120574951, + "learning_rate": 1e-06, + "loss": 1.1252, + "mean_token_accuracy": 0.6701806783676147, + "num_tokens": 86277975.0, + "step": 3338 + }, + { + "epoch": 0.3666813090270152, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.290668487548828, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6942312717437744, + "num_tokens": 86304784.0, + "step": 3339 + }, + { + "epoch": 0.3667911267296288, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.2128424644470215, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7102969884872437, + "num_tokens": 86334727.0, + "step": 3340 + }, + { + "epoch": 0.36690094443224247, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.3885083198547363, + "learning_rate": 1e-06, + "loss": 1.11, + "mean_token_accuracy": 0.6928852796554565, + "num_tokens": 86360301.0, + "step": 3341 + }, + { + "epoch": 0.3670107621348561, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.369042158126831, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7099611163139343, + "num_tokens": 86384913.0, + "step": 3342 + }, + { + "epoch": 0.3671205798374698, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.36650013923645, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7019358277320862, + "num_tokens": 86410825.0, + "step": 3343 + }, + { + "epoch": 0.36723039754008346, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.3405466079711914, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6936205625534058, + "num_tokens": 86436639.0, + "step": 3344 + }, + { + "epoch": 0.3673402152426971, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.1983728408813477, + "learning_rate": 1e-06, + 
"loss": 1.0463, + "mean_token_accuracy": 0.6996721029281616, + "num_tokens": 86465164.0, + "step": 3345 + }, + { + "epoch": 0.3674500329453108, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.533146858215332, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6913543939590454, + "num_tokens": 86487983.0, + "step": 3346 + }, + { + "epoch": 0.36755985064792446, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.162726640701294, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7057439684867859, + "num_tokens": 86515243.0, + "step": 3347 + }, + { + "epoch": 0.3676696683505381, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.4091737270355225, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7081925868988037, + "num_tokens": 86540172.0, + "step": 3348 + }, + { + "epoch": 0.36777948605315175, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.2209274768829346, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6804571151733398, + "num_tokens": 86571138.0, + "step": 3349 + }, + { + "epoch": 0.36788930375576545, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.077498435974121, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6844576597213745, + "num_tokens": 86604797.0, + "step": 3350 + }, + { + "epoch": 0.3679991214583791, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.3919551372528076, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7106752991676331, + "num_tokens": 86628636.0, + "step": 3351 + }, + { + "epoch": 0.36810893916099274, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.044724702835083, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6928723454475403, + "num_tokens": 86663120.0, + "step": 3352 + }, + { + "epoch": 0.3682187568636064, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.305410623550415, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 
0.7177222967147827, + "num_tokens": 86689552.0, + "step": 3353 + }, + { + "epoch": 0.3683285745662201, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.5092153549194336, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6984643340110779, + "num_tokens": 86712863.0, + "step": 3354 + }, + { + "epoch": 0.36843839226883374, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.4268839359283447, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6928814053535461, + "num_tokens": 86736563.0, + "step": 3355 + }, + { + "epoch": 0.3685482099714474, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.216395378112793, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.7034543752670288, + "num_tokens": 86764256.0, + "step": 3356 + }, + { + "epoch": 0.3686580276740611, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.418806552886963, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7180535793304443, + "num_tokens": 86787797.0, + "step": 3357 + }, + { + "epoch": 0.36876784537667473, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.2410888671875, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7140946984291077, + "num_tokens": 86815390.0, + "step": 3358 + }, + { + "epoch": 0.3688776630792884, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.1090261936187744, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7324682474136353, + "num_tokens": 86846028.0, + "step": 3359 + }, + { + "epoch": 0.368987480781902, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.553647518157959, + "learning_rate": 1e-06, + "loss": 1.0909, + "mean_token_accuracy": 0.6820226907730103, + "num_tokens": 86869114.0, + "step": 3360 + }, + { + "epoch": 0.3690972984845157, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.4385061264038086, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6939513683319092, + "num_tokens": 86893563.0, + 
"step": 3361 + }, + { + "epoch": 0.36920711618712937, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.503167152404785, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7016727924346924, + "num_tokens": 86916057.0, + "step": 3362 + }, + { + "epoch": 0.369316933889743, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.3035452365875244, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6860039234161377, + "num_tokens": 86944193.0, + "step": 3363 + }, + { + "epoch": 0.36942675159235666, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.2794556617736816, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6952136754989624, + "num_tokens": 86973130.0, + "step": 3364 + }, + { + "epoch": 0.36953656929497036, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 2.3944931030273438, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.716437041759491, + "num_tokens": 86997045.0, + "step": 3365 + }, + { + "epoch": 0.369646386997584, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 7.050449371337891, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6889287233352661, + "num_tokens": 87021295.0, + "step": 3366 + }, + { + "epoch": 0.36975620470019765, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.318152666091919, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.7001625299453735, + "num_tokens": 87046413.0, + "step": 3367 + }, + { + "epoch": 0.36986602240281136, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.4803731441497803, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6930981874465942, + "num_tokens": 87073882.0, + "step": 3368 + }, + { + "epoch": 0.369975840105425, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.4202308654785156, + "learning_rate": 1e-06, + "loss": 1.1026, + "mean_token_accuracy": 0.683133602142334, + "num_tokens": 87100385.0, + "step": 3369 + }, + { + "epoch": 0.37008565780803865, + 
"ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.3961679935455322, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7138423919677734, + "num_tokens": 87126593.0, + "step": 3370 + }, + { + "epoch": 0.3701954755106523, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.0416061878204346, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6876888871192932, + "num_tokens": 87161071.0, + "step": 3371 + }, + { + "epoch": 0.370305293213266, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.311741828918457, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6962829232215881, + "num_tokens": 87186272.0, + "step": 3372 + }, + { + "epoch": 0.37041511091587964, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.2391295433044434, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7064803838729858, + "num_tokens": 87212120.0, + "step": 3373 + }, + { + "epoch": 0.3705249286184933, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.6611475944519043, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7080259919166565, + "num_tokens": 87231883.0, + "step": 3374 + }, + { + "epoch": 0.370634746321107, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.209568500518799, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6851158738136292, + "num_tokens": 87260370.0, + "step": 3375 + }, + { + "epoch": 0.37074456402372064, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.3524322509765625, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6953372955322266, + "num_tokens": 87286894.0, + "step": 3376 + }, + { + "epoch": 0.3708543817263343, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.1211233139038086, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6820006966590881, + "num_tokens": 87321398.0, + "step": 3377 + }, + { + "epoch": 0.3709641994289479, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 
2.599929094314575, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7143412828445435, + "num_tokens": 87345279.0, + "step": 3378 + }, + { + "epoch": 0.37107401713156163, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.6341822147369385, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7026705741882324, + "num_tokens": 87369573.0, + "step": 3379 + }, + { + "epoch": 0.3711838348341753, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.4087748527526855, + "learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6833606958389282, + "num_tokens": 87396176.0, + "step": 3380 + }, + { + "epoch": 0.3712936525367889, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.2491519451141357, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6945779323577881, + "num_tokens": 87426871.0, + "step": 3381 + }, + { + "epoch": 0.37140347023940257, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.4864251613616943, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.7019495368003845, + "num_tokens": 87453349.0, + "step": 3382 + }, + { + "epoch": 0.37151328794201627, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 6.864969253540039, + "learning_rate": 1e-06, + "loss": 1.1282, + "mean_token_accuracy": 0.6952605247497559, + "num_tokens": 87484742.0, + "step": 3383 + }, + { + "epoch": 0.3716231056446299, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.6292052268981934, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7137566804885864, + "num_tokens": 87506704.0, + "step": 3384 + }, + { + "epoch": 0.37173292334724356, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.538302421569824, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.691974401473999, + "num_tokens": 87529839.0, + "step": 3385 + }, + { + "epoch": 0.37184274104985726, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.341603994369507, + "learning_rate": 1e-06, + "loss": 
1.0724, + "mean_token_accuracy": 0.6901083588600159, + "num_tokens": 87558485.0, + "step": 3386 + }, + { + "epoch": 0.3719525587524709, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.265045404434204, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.706063985824585, + "num_tokens": 87586352.0, + "step": 3387 + }, + { + "epoch": 0.37206237645508455, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.3757853507995605, + "learning_rate": 1e-06, + "loss": 1.1105, + "mean_token_accuracy": 0.6880908012390137, + "num_tokens": 87614672.0, + "step": 3388 + }, + { + "epoch": 0.3721721941576982, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.239001989364624, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7131133675575256, + "num_tokens": 87643313.0, + "step": 3389 + }, + { + "epoch": 0.3722820118603119, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.2660443782806396, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6966801881790161, + "num_tokens": 87670451.0, + "step": 3390 + }, + { + "epoch": 0.37239182956292555, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.624147653579712, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6847909688949585, + "num_tokens": 87692044.0, + "step": 3391 + }, + { + "epoch": 0.3725016472655392, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.1002447605133057, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7021338939666748, + "num_tokens": 87721089.0, + "step": 3392 + }, + { + "epoch": 0.37261146496815284, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.408499240875244, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7015358209609985, + "num_tokens": 87743663.0, + "step": 3393 + }, + { + "epoch": 0.37272128267076654, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.5776584148406982, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7101811170578003, + 
"num_tokens": 87764986.0, + "step": 3394 + }, + { + "epoch": 0.3728311003733802, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.3945651054382324, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7053967714309692, + "num_tokens": 87790380.0, + "step": 3395 + }, + { + "epoch": 0.37294091807599383, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.446032762527466, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6883435249328613, + "num_tokens": 87814012.0, + "step": 3396 + }, + { + "epoch": 0.37305073577860753, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.4203124046325684, + "learning_rate": 1e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.6882251501083374, + "num_tokens": 87838307.0, + "step": 3397 + }, + { + "epoch": 0.3731605534812212, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.3563692569732666, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7004141211509705, + "num_tokens": 87864332.0, + "step": 3398 + }, + { + "epoch": 0.3732703711838348, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.770120859146118, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6916040182113647, + "num_tokens": 87884903.0, + "step": 3399 + }, + { + "epoch": 0.37338018888644847, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.386218309402466, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6966286301612854, + "num_tokens": 87908543.0, + "step": 3400 + }, + { + "epoch": 0.3734900065890622, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.567168951034546, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7070351839065552, + "num_tokens": 87930022.0, + "step": 3401 + }, + { + "epoch": 0.3735998242916758, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.1418089866638184, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6915640234947205, + "num_tokens": 87961748.0, + "step": 3402 + }, + { + 
"epoch": 0.37370964199428947, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.042964220046997, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7063891291618347, + "num_tokens": 87995355.0, + "step": 3403 + }, + { + "epoch": 0.37381945969690317, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.0953304767608643, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6785835027694702, + "num_tokens": 88026996.0, + "step": 3404 + }, + { + "epoch": 0.3739292773995168, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.3249502182006836, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7206871509552002, + "num_tokens": 88052140.0, + "step": 3405 + }, + { + "epoch": 0.37403909510213046, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.5327422618865967, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6877418756484985, + "num_tokens": 88073640.0, + "step": 3406 + }, + { + "epoch": 0.3741489128047441, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.4921419620513916, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6992520689964294, + "num_tokens": 88096965.0, + "step": 3407 + }, + { + "epoch": 0.3742587305073578, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.0889642238616943, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6905429363250732, + "num_tokens": 88128966.0, + "step": 3408 + }, + { + "epoch": 0.37436854820997145, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.3841333389282227, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7108994722366333, + "num_tokens": 88152934.0, + "step": 3409 + }, + { + "epoch": 0.3744783659125851, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.599611520767212, + "learning_rate": 1e-06, + "loss": 1.0819, + "mean_token_accuracy": 0.682784914970398, + "num_tokens": 88175382.0, + "step": 3410 + }, + { + "epoch": 0.37458818361519874, + "ewc_loss": 
9.47713851928711e-06, + "grad_norm": 2.3119616508483887, + "learning_rate": 1e-06, + "loss": 1.1091, + "mean_token_accuracy": 0.6839698553085327, + "num_tokens": 88204938.0, + "step": 3411 + }, + { + "epoch": 0.37469800131781245, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.5664761066436768, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7003318071365356, + "num_tokens": 88226259.0, + "step": 3412 + }, + { + "epoch": 0.3748078190204261, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.38199782371521, + "learning_rate": 1e-06, + "loss": 1.0944, + "mean_token_accuracy": 0.672351598739624, + "num_tokens": 88251893.0, + "step": 3413 + }, + { + "epoch": 0.37491763672303974, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.303912878036499, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7050830721855164, + "num_tokens": 88277786.0, + "step": 3414 + }, + { + "epoch": 0.37502745442565344, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.6808104515075684, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7024521827697754, + "num_tokens": 88300143.0, + "step": 3415 + }, + { + "epoch": 0.3751372721282671, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.503981590270996, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7031622529029846, + "num_tokens": 88323457.0, + "step": 3416 + }, + { + "epoch": 0.37524708983088073, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.6488254070281982, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7121045589447021, + "num_tokens": 88343643.0, + "step": 3417 + }, + { + "epoch": 0.3753569075334944, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.174382448196411, + "learning_rate": 1e-06, + "loss": 1.1081, + "mean_token_accuracy": 0.6731651425361633, + "num_tokens": 88373513.0, + "step": 3418 + }, + { + "epoch": 0.3754667252361081, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 2.2341177463531494, + 
"learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6899157166481018, + "num_tokens": 88402245.0, + "step": 3419 + }, + { + "epoch": 0.3755765429387217, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.2636396884918213, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6919816732406616, + "num_tokens": 88430919.0, + "step": 3420 + }, + { + "epoch": 0.37568636064133537, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.120969772338867, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7099425792694092, + "num_tokens": 88460174.0, + "step": 3421 + }, + { + "epoch": 0.37579617834394907, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.341613292694092, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6883749961853027, + "num_tokens": 88488063.0, + "step": 3422 + }, + { + "epoch": 0.3759059960465627, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.189107894897461, + "learning_rate": 1e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.6940690875053406, + "num_tokens": 88520294.0, + "step": 3423 + }, + { + "epoch": 0.37601581374917636, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.144550085067749, + "learning_rate": 1e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6771577000617981, + "num_tokens": 88554544.0, + "step": 3424 + }, + { + "epoch": 0.37612563145179, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.3204708099365234, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7073798179626465, + "num_tokens": 88578934.0, + "step": 3425 + }, + { + "epoch": 0.3762354491544037, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.4842987060546875, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7041008472442627, + "num_tokens": 88600495.0, + "step": 3426 + }, + { + "epoch": 0.37634526685701736, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.267878770828247, + "learning_rate": 1e-06, + "loss": 0.9703, + 
"mean_token_accuracy": 0.7118823528289795, + "num_tokens": 88627034.0, + "step": 3427 + }, + { + "epoch": 0.376455084559631, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.4005160331726074, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7010796666145325, + "num_tokens": 88650799.0, + "step": 3428 + }, + { + "epoch": 0.37656490226224465, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.5736663341522217, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7057195901870728, + "num_tokens": 88670895.0, + "step": 3429 + }, + { + "epoch": 0.37667471996485835, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.4159276485443115, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6929051876068115, + "num_tokens": 88693919.0, + "step": 3430 + }, + { + "epoch": 0.376784537667472, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.3047537803649902, + "learning_rate": 1e-06, + "loss": 1.0974, + "mean_token_accuracy": 0.674116849899292, + "num_tokens": 88721418.0, + "step": 3431 + }, + { + "epoch": 0.37689435537008564, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.429081678390503, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7193775177001953, + "num_tokens": 88745403.0, + "step": 3432 + }, + { + "epoch": 0.37700417307269934, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.078545570373535, + "learning_rate": 1e-06, + "loss": 1.0866, + "mean_token_accuracy": 0.6845320463180542, + "num_tokens": 88780492.0, + "step": 3433 + }, + { + "epoch": 0.377113990775313, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.181884288787842, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6868326663970947, + "num_tokens": 88811625.0, + "step": 3434 + }, + { + "epoch": 0.37722380847792664, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.1882827281951904, + "learning_rate": 1e-06, + "loss": 1.1487, + "mean_token_accuracy": 0.6767222881317139, + "num_tokens": 
88843377.0, + "step": 3435 + }, + { + "epoch": 0.3773336261805403, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.295806407928467, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7028963565826416, + "num_tokens": 88871222.0, + "step": 3436 + }, + { + "epoch": 0.377443443883154, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.0083322525024414, + "learning_rate": 1e-06, + "loss": 1.1049, + "mean_token_accuracy": 0.6732615232467651, + "num_tokens": 88906003.0, + "step": 3437 + }, + { + "epoch": 0.37755326158576763, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.345289468765259, + "learning_rate": 1e-06, + "loss": 1.0949, + "mean_token_accuracy": 0.6826928853988647, + "num_tokens": 88934175.0, + "step": 3438 + }, + { + "epoch": 0.3776630792883813, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.083303213119507, + "learning_rate": 1e-06, + "loss": 1.1112, + "mean_token_accuracy": 0.6753508448600769, + "num_tokens": 88967305.0, + "step": 3439 + }, + { + "epoch": 0.3777728969909949, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.338405132293701, + "learning_rate": 1e-06, + "loss": 1.1153, + "mean_token_accuracy": 0.6718921661376953, + "num_tokens": 88995058.0, + "step": 3440 + }, + { + "epoch": 0.3778827146936086, + "ewc_loss": 9.5367431640625e-06, + "grad_norm": 2.770677089691162, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6866152286529541, + "num_tokens": 89016825.0, + "step": 3441 + }, + { + "epoch": 0.37799253239622227, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.3543474674224854, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6981010437011719, + "num_tokens": 89044034.0, + "step": 3442 + }, + { + "epoch": 0.3781023500988359, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.4382855892181396, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6994608640670776, + "num_tokens": 89068052.0, + "step": 3443 + }, + { + "epoch": 
0.3782121678014496, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.30899715423584, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7124475240707397, + "num_tokens": 89094888.0, + "step": 3444 + }, + { + "epoch": 0.37832198550406326, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.306856870651245, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7006120085716248, + "num_tokens": 89120885.0, + "step": 3445 + }, + { + "epoch": 0.3784318032066769, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.265415668487549, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7145898342132568, + "num_tokens": 89148606.0, + "step": 3446 + }, + { + "epoch": 0.37854162090929055, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.623737096786499, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7237731218338013, + "num_tokens": 89170649.0, + "step": 3447 + }, + { + "epoch": 0.37865143861190426, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.3833107948303223, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6897644400596619, + "num_tokens": 89196599.0, + "step": 3448 + }, + { + "epoch": 0.3787612563145179, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.5393264293670654, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7080703973770142, + "num_tokens": 89218130.0, + "step": 3449 + }, + { + "epoch": 0.37887107401713155, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.424490451812744, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6943328380584717, + "num_tokens": 89242478.0, + "step": 3450 + }, + { + "epoch": 0.37898089171974525, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.3705008029937744, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6933011412620544, + "num_tokens": 89267931.0, + "step": 3451 + }, + { + "epoch": 0.3790907094223589, + "ewc_loss": 
9.655952453613281e-06, + "grad_norm": 2.116398811340332, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7212038636207581, + "num_tokens": 89298706.0, + "step": 3452 + }, + { + "epoch": 0.37920052712497254, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 2.3809165954589844, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.708407998085022, + "num_tokens": 89325375.0, + "step": 3453 + }, + { + "epoch": 0.3793103448275862, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.5917892456054688, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7090399265289307, + "num_tokens": 89346526.0, + "step": 3454 + }, + { + "epoch": 0.3794201625301999, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.4167609214782715, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6916174292564392, + "num_tokens": 89372830.0, + "step": 3455 + }, + { + "epoch": 0.37952998023281354, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.575745105743408, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6983036994934082, + "num_tokens": 89396497.0, + "step": 3456 + }, + { + "epoch": 0.3796397979354272, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.397432327270508, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7067484855651855, + "num_tokens": 89422444.0, + "step": 3457 + }, + { + "epoch": 0.3797496156380408, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.427855968475342, + "learning_rate": 1e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.6822237968444824, + "num_tokens": 89447722.0, + "step": 3458 + }, + { + "epoch": 0.37985943334065453, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.3040289878845215, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7226474285125732, + "num_tokens": 89472695.0, + "step": 3459 + }, + { + "epoch": 0.3799692510432682, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 
2.162153959274292, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6920251846313477, + "num_tokens": 89504156.0, + "step": 3460 + }, + { + "epoch": 0.3800790687458818, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 2.1613926887512207, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.691095769405365, + "num_tokens": 89533887.0, + "step": 3461 + }, + { + "epoch": 0.3801888864484955, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.5212783813476562, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7029093503952026, + "num_tokens": 89558518.0, + "step": 3462 + }, + { + "epoch": 0.38029870415110917, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.664672613143921, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6892762184143066, + "num_tokens": 89579890.0, + "step": 3463 + }, + { + "epoch": 0.3804085218537228, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4278948307037354, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7031058073043823, + "num_tokens": 89604732.0, + "step": 3464 + }, + { + "epoch": 0.38051833955633646, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.8128676414489746, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7097854614257812, + "num_tokens": 89625793.0, + "step": 3465 + }, + { + "epoch": 0.38062815725895016, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.440730094909668, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7130858302116394, + "num_tokens": 89648859.0, + "step": 3466 + }, + { + "epoch": 0.3807379749615638, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.204066753387451, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6970779895782471, + "num_tokens": 89679663.0, + "step": 3467 + }, + { + "epoch": 0.38084779266417745, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.3209667205810547, + "learning_rate": 1e-06, + 
"loss": 1.0959, + "mean_token_accuracy": 0.6805445551872253, + "num_tokens": 89705345.0, + "step": 3468 + }, + { + "epoch": 0.3809576103667911, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4181604385375977, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6915316581726074, + "num_tokens": 89730700.0, + "step": 3469 + }, + { + "epoch": 0.3810674280694048, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.120948076248169, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7110608816146851, + "num_tokens": 89763152.0, + "step": 3470 + }, + { + "epoch": 0.38117724577201845, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4251608848571777, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6895170211791992, + "num_tokens": 89786840.0, + "step": 3471 + }, + { + "epoch": 0.3812870634746321, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4196696281433105, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.712982177734375, + "num_tokens": 89811531.0, + "step": 3472 + }, + { + "epoch": 0.3813968811772458, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.417598009109497, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6935431361198425, + "num_tokens": 89837392.0, + "step": 3473 + }, + { + "epoch": 0.38150669887985944, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.458432674407959, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7121783494949341, + "num_tokens": 89861657.0, + "step": 3474 + }, + { + "epoch": 0.3816165165824731, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4339663982391357, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6992267966270447, + "num_tokens": 89888089.0, + "step": 3475 + }, + { + "epoch": 0.38172633428508673, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.0352938175201416, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 
0.6991323232650757, + "num_tokens": 89922136.0, + "step": 3476 + }, + { + "epoch": 0.38183615198770043, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.325824499130249, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7059404850006104, + "num_tokens": 89946651.0, + "step": 3477 + }, + { + "epoch": 0.3819459696903141, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 2.5451507568359375, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7030419111251831, + "num_tokens": 89970358.0, + "step": 3478 + }, + { + "epoch": 0.3820557873929277, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 2.5303027629852295, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6886114478111267, + "num_tokens": 89994077.0, + "step": 3479 + }, + { + "epoch": 0.3821656050955414, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.58544659614563, + "learning_rate": 1e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6858354806900024, + "num_tokens": 90017250.0, + "step": 3480 + }, + { + "epoch": 0.3822754227981551, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.256054162979126, + "learning_rate": 1e-06, + "loss": 1.09, + "mean_token_accuracy": 0.6805671453475952, + "num_tokens": 90046866.0, + "step": 3481 + }, + { + "epoch": 0.3823852405007687, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.5400466918945312, + "learning_rate": 1e-06, + "loss": 1.1191, + "mean_token_accuracy": 0.6790107488632202, + "num_tokens": 90070820.0, + "step": 3482 + }, + { + "epoch": 0.38249505820338237, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.2710421085357666, + "learning_rate": 1e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6830868721008301, + "num_tokens": 90103266.0, + "step": 3483 + }, + { + "epoch": 0.38260487590599607, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.5332558155059814, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7061401605606079, + "num_tokens": 90128277.0, + 
"step": 3484 + }, + { + "epoch": 0.3827146936086097, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.2484288215637207, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.691321611404419, + "num_tokens": 90158627.0, + "step": 3485 + }, + { + "epoch": 0.38282451131122336, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.3086323738098145, + "learning_rate": 1e-06, + "loss": 1.095, + "mean_token_accuracy": 0.6770933270454407, + "num_tokens": 90188177.0, + "step": 3486 + }, + { + "epoch": 0.382934329013837, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.146329402923584, + "learning_rate": 1e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6882308721542358, + "num_tokens": 90219370.0, + "step": 3487 + }, + { + "epoch": 0.3830441467164507, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.586608409881592, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7262668609619141, + "num_tokens": 90239794.0, + "step": 3488 + }, + { + "epoch": 0.38315396441906435, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.1889467239379883, + "learning_rate": 1e-06, + "loss": 1.0936, + "mean_token_accuracy": 0.6786632537841797, + "num_tokens": 90270780.0, + "step": 3489 + }, + { + "epoch": 0.383263782121678, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.3184316158294678, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7158018350601196, + "num_tokens": 90299170.0, + "step": 3490 + }, + { + "epoch": 0.3833735998242917, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.220036745071411, + "learning_rate": 1e-06, + "loss": 1.1178, + "mean_token_accuracy": 0.6746291518211365, + "num_tokens": 90330360.0, + "step": 3491 + }, + { + "epoch": 0.38348341752690535, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.7175676822662354, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7060264348983765, + "num_tokens": 90350344.0, + "step": 3492 + }, + { + "epoch": 0.383593235229519, 
+ "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.508986473083496, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7174865007400513, + "num_tokens": 90374020.0, + "step": 3493 + }, + { + "epoch": 0.38370305293213264, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.461620569229126, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7032711505889893, + "num_tokens": 90398968.0, + "step": 3494 + }, + { + "epoch": 0.38381287063474634, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4827754497528076, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.733165979385376, + "num_tokens": 90424099.0, + "step": 3495 + }, + { + "epoch": 0.38392268833736, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4888131618499756, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.694849967956543, + "num_tokens": 90447219.0, + "step": 3496 + }, + { + "epoch": 0.38403250603997363, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.2263190746307373, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.707309365272522, + "num_tokens": 90475098.0, + "step": 3497 + }, + { + "epoch": 0.38414232374258733, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.70855712890625, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7344351410865784, + "num_tokens": 90493385.0, + "step": 3498 + }, + { + "epoch": 0.384252141445201, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.3112661838531494, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7023431062698364, + "num_tokens": 90524253.0, + "step": 3499 + }, + { + "epoch": 0.3843619591478146, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.235236406326294, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6922792196273804, + "num_tokens": 90554973.0, + "step": 3500 + }, + { + "epoch": 0.38447177685042827, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 
2.187997579574585, + "learning_rate": 1e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.7018361687660217, + "num_tokens": 90584515.0, + "step": 3501 + }, + { + "epoch": 0.38458159455304197, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4715356826782227, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6971142888069153, + "num_tokens": 90608481.0, + "step": 3502 + }, + { + "epoch": 0.3846914122556556, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.3498787879943848, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6933849453926086, + "num_tokens": 90633367.0, + "step": 3503 + }, + { + "epoch": 0.38480122995826926, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4835398197174072, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6994955539703369, + "num_tokens": 90657309.0, + "step": 3504 + }, + { + "epoch": 0.3849110476608829, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.4437739849090576, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6994692087173462, + "num_tokens": 90681447.0, + "step": 3505 + }, + { + "epoch": 0.3850208653634966, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.469712257385254, + "learning_rate": 1e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.686908483505249, + "num_tokens": 90705941.0, + "step": 3506 + }, + { + "epoch": 0.38513068306611026, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.185483694076538, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7119238376617432, + "num_tokens": 90734216.0, + "step": 3507 + }, + { + "epoch": 0.3852405007687239, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.261211633682251, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.704190731048584, + "num_tokens": 90761487.0, + "step": 3508 + }, + { + "epoch": 0.3853503184713376, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.260798692703247, + "learning_rate": 1e-06, + 
"loss": 1.0546, + "mean_token_accuracy": 0.6901620030403137, + "num_tokens": 90789622.0, + "step": 3509 + }, + { + "epoch": 0.38546013617395125, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.17765474319458, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6914840936660767, + "num_tokens": 90819427.0, + "step": 3510 + }, + { + "epoch": 0.3855699538765649, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.346170425415039, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6856751441955566, + "num_tokens": 90846329.0, + "step": 3511 + }, + { + "epoch": 0.38567977157917854, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.308565616607666, + "learning_rate": 1e-06, + "loss": 1.1222, + "mean_token_accuracy": 0.6687970757484436, + "num_tokens": 90875571.0, + "step": 3512 + }, + { + "epoch": 0.38578958928179224, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.732356309890747, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7308818101882935, + "num_tokens": 90895193.0, + "step": 3513 + }, + { + "epoch": 0.3858994069844059, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.154580593109131, + "learning_rate": 1e-06, + "loss": 1.1179, + "mean_token_accuracy": 0.6757632493972778, + "num_tokens": 90926516.0, + "step": 3514 + }, + { + "epoch": 0.38600922468701954, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.351205587387085, + "learning_rate": 1e-06, + "loss": 1.0871, + "mean_token_accuracy": 0.6763821840286255, + "num_tokens": 90955106.0, + "step": 3515 + }, + { + "epoch": 0.3861190423896332, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.394517421722412, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.7014190554618835, + "num_tokens": 90980991.0, + "step": 3516 + }, + { + "epoch": 0.3862288600922469, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 2.5575828552246094, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 
0.6966339349746704, + "num_tokens": 91004316.0, + "step": 3517 + }, + { + "epoch": 0.38633867779486053, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 2.3445677757263184, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7241857647895813, + "num_tokens": 91027761.0, + "step": 3518 + }, + { + "epoch": 0.3864484954974742, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 2.545358896255493, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6843923330307007, + "num_tokens": 91050506.0, + "step": 3519 + }, + { + "epoch": 0.3865583132000879, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.167079210281372, + "learning_rate": 1e-06, + "loss": 1.0844, + "mean_token_accuracy": 0.6769826412200928, + "num_tokens": 91082101.0, + "step": 3520 + }, + { + "epoch": 0.3866681309027015, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.573138952255249, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7176571488380432, + "num_tokens": 91104010.0, + "step": 3521 + }, + { + "epoch": 0.38677794860531517, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 2.324338674545288, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.7042708992958069, + "num_tokens": 91129990.0, + "step": 3522 + }, + { + "epoch": 0.3868877663079288, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.291722297668457, + "learning_rate": 1e-06, + "loss": 1.1201, + "mean_token_accuracy": 0.6731685400009155, + "num_tokens": 91158919.0, + "step": 3523 + }, + { + "epoch": 0.3869975840105425, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 2.483654499053955, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7047363519668579, + "num_tokens": 91180887.0, + "step": 3524 + }, + { + "epoch": 0.38710740171315616, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.182931900024414, + "learning_rate": 1e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6901100873947144, + "num_tokens": 91211256.0, + 
"step": 3525 + }, + { + "epoch": 0.3872172194157698, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.3753981590270996, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6989284753799438, + "num_tokens": 91235786.0, + "step": 3526 + }, + { + "epoch": 0.3873270371183835, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 2.4957950115203857, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6979137063026428, + "num_tokens": 91259530.0, + "step": 3527 + }, + { + "epoch": 0.38743685482099716, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.496870994567871, + "learning_rate": 1e-06, + "loss": 1.0762, + "mean_token_accuracy": 0.6820533275604248, + "num_tokens": 91284855.0, + "step": 3528 + }, + { + "epoch": 0.3875466725236108, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.3026864528656006, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7222319841384888, + "num_tokens": 91311673.0, + "step": 3529 + }, + { + "epoch": 0.38765649022622445, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.439366579055786, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.693121075630188, + "num_tokens": 91337540.0, + "step": 3530 + }, + { + "epoch": 0.38776630792883815, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.3553292751312256, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6936008930206299, + "num_tokens": 91363407.0, + "step": 3531 + }, + { + "epoch": 0.3878761256314518, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.749037027359009, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6999616622924805, + "num_tokens": 91382935.0, + "step": 3532 + }, + { + "epoch": 0.38798594333406544, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.4375979900360107, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6968957185745239, + "num_tokens": 91407494.0, + "step": 3533 + }, + { + "epoch": 
0.3880957610366791, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.2509167194366455, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7183969616889954, + "num_tokens": 91434532.0, + "step": 3534 + }, + { + "epoch": 0.3882055787392928, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.765493631362915, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7285435199737549, + "num_tokens": 91453366.0, + "step": 3535 + }, + { + "epoch": 0.38831539644190644, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.283580780029297, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6966091394424438, + "num_tokens": 91479978.0, + "step": 3536 + }, + { + "epoch": 0.3884252141445201, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.1051442623138428, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6915150880813599, + "num_tokens": 91512790.0, + "step": 3537 + }, + { + "epoch": 0.3885350318471338, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.3243114948272705, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6978142857551575, + "num_tokens": 91539101.0, + "step": 3538 + }, + { + "epoch": 0.38864484954974743, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.6509926319122314, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.69427490234375, + "num_tokens": 91561374.0, + "step": 3539 + }, + { + "epoch": 0.3887546672523611, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.1244306564331055, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7056550979614258, + "num_tokens": 91589580.0, + "step": 3540 + }, + { + "epoch": 0.3888644849549747, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.4553115367889404, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6906507015228271, + "num_tokens": 91612399.0, + "step": 3541 + }, + { + "epoch": 0.3889743026575884, + "ewc_loss": 
9.834766387939453e-06, + "grad_norm": 2.1669986248016357, + "learning_rate": 1e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.6790241599082947, + "num_tokens": 91644091.0, + "step": 3542 + }, + { + "epoch": 0.38908412036020207, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.222611427307129, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.6995319724082947, + "num_tokens": 91670530.0, + "step": 3543 + }, + { + "epoch": 0.3891939380628157, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.558938980102539, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.7056825160980225, + "num_tokens": 91694669.0, + "step": 3544 + }, + { + "epoch": 0.38930375576542936, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.0753748416900635, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.684922456741333, + "num_tokens": 91728597.0, + "step": 3545 + }, + { + "epoch": 0.38941357346804306, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.16031813621521, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6962600350379944, + "num_tokens": 91756801.0, + "step": 3546 + }, + { + "epoch": 0.3895233911706567, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.6624886989593506, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6842527389526367, + "num_tokens": 91778500.0, + "step": 3547 + }, + { + "epoch": 0.38963320887327035, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.349916458129883, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.695014476776123, + "num_tokens": 91804944.0, + "step": 3548 + }, + { + "epoch": 0.38974302657588406, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.2126433849334717, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7272272109985352, + "num_tokens": 91833600.0, + "step": 3549 + }, + { + "epoch": 0.3898528442784977, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 
2.327216148376465, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.727981448173523, + "num_tokens": 91856897.0, + "step": 3550 + }, + { + "epoch": 0.38996266198111135, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.481480360031128, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.696267306804657, + "num_tokens": 91878769.0, + "step": 3551 + }, + { + "epoch": 0.390072479683725, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.5144193172454834, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7088996171951294, + "num_tokens": 91901129.0, + "step": 3552 + }, + { + "epoch": 0.3901822973863387, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.6228373050689697, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7258138656616211, + "num_tokens": 91921789.0, + "step": 3553 + }, + { + "epoch": 0.39029211508895234, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.303539991378784, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6904129385948181, + "num_tokens": 91949252.0, + "step": 3554 + }, + { + "epoch": 0.390401932791566, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.5158538818359375, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7152027487754822, + "num_tokens": 91970893.0, + "step": 3555 + }, + { + "epoch": 0.3905117504941797, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 2.2608182430267334, + "learning_rate": 1e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.6815191507339478, + "num_tokens": 92003459.0, + "step": 3556 + }, + { + "epoch": 0.39062156819679333, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.3698596954345703, + "learning_rate": 1e-06, + "loss": 1.0991, + "mean_token_accuracy": 0.6828887462615967, + "num_tokens": 92032949.0, + "step": 3557 + }, + { + "epoch": 0.390731385899407, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.3867428302764893, + "learning_rate": 1e-06, + 
"loss": 1.0097, + "mean_token_accuracy": 0.6980057954788208, + "num_tokens": 92058753.0, + "step": 3558 + }, + { + "epoch": 0.3908412036020206, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.4218099117279053, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6733196973800659, + "num_tokens": 92086817.0, + "step": 3559 + }, + { + "epoch": 0.3909510213046343, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.2934763431549072, + "learning_rate": 1e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.6902787685394287, + "num_tokens": 92114863.0, + "step": 3560 + }, + { + "epoch": 0.391060839007248, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.2635622024536133, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6911355257034302, + "num_tokens": 92143174.0, + "step": 3561 + }, + { + "epoch": 0.3911706567098616, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 2.3492555618286133, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7045714855194092, + "num_tokens": 92169271.0, + "step": 3562 + }, + { + "epoch": 0.39128047441247527, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.1881144046783447, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.69855797290802, + "num_tokens": 92197530.0, + "step": 3563 + }, + { + "epoch": 0.39139029211508897, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 2.2081615924835205, + "learning_rate": 1e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.6666555404663086, + "num_tokens": 92226522.0, + "step": 3564 + }, + { + "epoch": 0.3915001098177026, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 2.2019803524017334, + "learning_rate": 1e-06, + "loss": 1.072, + "mean_token_accuracy": 0.6878744959831238, + "num_tokens": 92255728.0, + "step": 3565 + }, + { + "epoch": 0.39160992752031626, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 2.0195233821868896, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 
0.7067886590957642, + "num_tokens": 92288415.0, + "step": 3566 + }, + { + "epoch": 0.39171974522292996, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.415487289428711, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7056742310523987, + "num_tokens": 92310582.0, + "step": 3567 + }, + { + "epoch": 0.3918295629255436, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3851511478424072, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6895676851272583, + "num_tokens": 92337192.0, + "step": 3568 + }, + { + "epoch": 0.39193938062815725, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 2.3691256046295166, + "learning_rate": 1e-06, + "loss": 1.1051, + "mean_token_accuracy": 0.6750409007072449, + "num_tokens": 92366075.0, + "step": 3569 + }, + { + "epoch": 0.3920491983307709, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.473865032196045, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6933972835540771, + "num_tokens": 92391433.0, + "step": 3570 + }, + { + "epoch": 0.3921590160333846, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3803980350494385, + "learning_rate": 1e-06, + "loss": 1.1262, + "mean_token_accuracy": 0.6731628775596619, + "num_tokens": 92419463.0, + "step": 3571 + }, + { + "epoch": 0.39226883373599825, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.5595247745513916, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.702930212020874, + "num_tokens": 92441456.0, + "step": 3572 + }, + { + "epoch": 0.3923786514386119, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.240957736968994, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7198967933654785, + "num_tokens": 92466494.0, + "step": 3573 + }, + { + "epoch": 0.3924884691412256, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3630104064941406, + "learning_rate": 1e-06, + "loss": 1.0755, + "mean_token_accuracy": 0.6844425201416016, + "num_tokens": 
92491099.0, + "step": 3574 + }, + { + "epoch": 0.39259828684383924, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.4595866203308105, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7214568853378296, + "num_tokens": 92518006.0, + "step": 3575 + }, + { + "epoch": 0.3927081045464529, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3077023029327393, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7049621939659119, + "num_tokens": 92542700.0, + "step": 3576 + }, + { + "epoch": 0.39281792224906653, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.427433967590332, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6932823657989502, + "num_tokens": 92566279.0, + "step": 3577 + }, + { + "epoch": 0.39292773995168023, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.164252519607544, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6855113506317139, + "num_tokens": 92599121.0, + "step": 3578 + }, + { + "epoch": 0.3930375576542939, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.4745070934295654, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6992554664611816, + "num_tokens": 92624001.0, + "step": 3579 + }, + { + "epoch": 0.3931473753569075, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.2174510955810547, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.7003166079521179, + "num_tokens": 92652425.0, + "step": 3580 + }, + { + "epoch": 0.39325719305952117, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.509903907775879, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6963328123092651, + "num_tokens": 92674724.0, + "step": 3581 + }, + { + "epoch": 0.39336701076213487, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.4065916538238525, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6942716836929321, + "num_tokens": 92699667.0, + "step": 3582 + }, + { + 
"epoch": 0.3934768284647485, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3135249614715576, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.687530517578125, + "num_tokens": 92727517.0, + "step": 3583 + }, + { + "epoch": 0.39358664616736216, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.6979100704193115, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7089758515357971, + "num_tokens": 92747373.0, + "step": 3584 + }, + { + "epoch": 0.39369646386997587, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.268277645111084, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.7064298391342163, + "num_tokens": 92775832.0, + "step": 3585 + }, + { + "epoch": 0.3938062815725895, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.0360076427459717, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6963483095169067, + "num_tokens": 92807378.0, + "step": 3586 + }, + { + "epoch": 0.39391609927520316, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.415268659591675, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7039063572883606, + "num_tokens": 92832155.0, + "step": 3587 + }, + { + "epoch": 0.3940259169778168, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3800876140594482, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7027766704559326, + "num_tokens": 92856290.0, + "step": 3588 + }, + { + "epoch": 0.3941357346804305, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3701887130737305, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6943846940994263, + "num_tokens": 92881250.0, + "step": 3589 + }, + { + "epoch": 0.39424555238304415, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.4860925674438477, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6919101476669312, + "num_tokens": 92904024.0, + "step": 3590 + }, + { + "epoch": 0.3943553700856578, + 
"ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3085267543792725, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7008994221687317, + "num_tokens": 92932387.0, + "step": 3591 + }, + { + "epoch": 0.39446518778827144, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.3378002643585205, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6822571754455566, + "num_tokens": 92959209.0, + "step": 3592 + }, + { + "epoch": 0.39457500549088514, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.918726682662964, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7158730626106262, + "num_tokens": 92977796.0, + "step": 3593 + }, + { + "epoch": 0.3946848231934988, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 2.4476709365844727, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7170412540435791, + "num_tokens": 93002954.0, + "step": 3594 + }, + { + "epoch": 0.39479464089611244, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.171828269958496, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6885799169540405, + "num_tokens": 93033062.0, + "step": 3595 + }, + { + "epoch": 0.39490445859872614, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.3804287910461426, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6910163164138794, + "num_tokens": 93059527.0, + "step": 3596 + }, + { + "epoch": 0.3950142763013398, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.347266912460327, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7153554558753967, + "num_tokens": 93085994.0, + "step": 3597 + }, + { + "epoch": 0.39512409400395343, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.333789348602295, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.691379964351654, + "num_tokens": 93110793.0, + "step": 3598 + }, + { + "epoch": 0.3952339117065671, + "ewc_loss": 1.0073184967041016e-05, + 
"grad_norm": 2.3329901695251465, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7111049890518188, + "num_tokens": 93137040.0, + "step": 3599 + }, + { + "epoch": 0.3953437294091808, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.5017690658569336, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7011741399765015, + "num_tokens": 93160651.0, + "step": 3600 + }, + { + "epoch": 0.3954535471117944, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.224074125289917, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7074848413467407, + "num_tokens": 93189203.0, + "step": 3601 + }, + { + "epoch": 0.39556336481440807, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.362093210220337, + "learning_rate": 1e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.6822673678398132, + "num_tokens": 93215152.0, + "step": 3602 + }, + { + "epoch": 0.39567318251702177, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.197906017303467, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7082628011703491, + "num_tokens": 93242481.0, + "step": 3603 + }, + { + "epoch": 0.3957830002196354, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.084916830062866, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6781412959098816, + "num_tokens": 93273852.0, + "step": 3604 + }, + { + "epoch": 0.39589281792224906, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.4179179668426514, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6986900568008423, + "num_tokens": 93297600.0, + "step": 3605 + }, + { + "epoch": 0.3960026356248627, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.359766960144043, + "learning_rate": 1e-06, + "loss": 1.1329, + "mean_token_accuracy": 0.670063316822052, + "num_tokens": 93326205.0, + "step": 3606 + }, + { + "epoch": 0.3961124533274764, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.239013910293579, + 
"learning_rate": 1e-06, + "loss": 1.1172, + "mean_token_accuracy": 0.6739540100097656, + "num_tokens": 93356719.0, + "step": 3607 + }, + { + "epoch": 0.39622227103009006, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.50626802444458, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7192023992538452, + "num_tokens": 93379681.0, + "step": 3608 + }, + { + "epoch": 0.3963320887327037, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.2583038806915283, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6917548179626465, + "num_tokens": 93407884.0, + "step": 3609 + }, + { + "epoch": 0.39644190643531735, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.2777259349823, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6977034211158752, + "num_tokens": 93434797.0, + "step": 3610 + }, + { + "epoch": 0.39655172413793105, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.2288978099823, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.699222981929779, + "num_tokens": 93461110.0, + "step": 3611 + }, + { + "epoch": 0.3966615418405447, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.177532434463501, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6927910447120667, + "num_tokens": 93493577.0, + "step": 3612 + }, + { + "epoch": 0.39677135954315834, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.542341709136963, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7084063291549683, + "num_tokens": 93516458.0, + "step": 3613 + }, + { + "epoch": 0.39688117724577204, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.3008787631988525, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7101506590843201, + "num_tokens": 93543958.0, + "step": 3614 + }, + { + "epoch": 0.3969909949483857, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 3.091181516647339, + "learning_rate": 1e-06, + "loss": 0.9691, + 
"mean_token_accuracy": 0.7076252698898315, + "num_tokens": 93560899.0, + "step": 3615 + }, + { + "epoch": 0.39710081265099934, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.7141873836517334, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7143949866294861, + "num_tokens": 93579658.0, + "step": 3616 + }, + { + "epoch": 0.397210630353613, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.21104097366333, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7421960234642029, + "num_tokens": 93605222.0, + "step": 3617 + }, + { + "epoch": 0.3973204480562267, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.3001081943511963, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7036790251731873, + "num_tokens": 93631550.0, + "step": 3618 + }, + { + "epoch": 0.39743026575884033, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.1011414527893066, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.690284252166748, + "num_tokens": 93661110.0, + "step": 3619 + }, + { + "epoch": 0.397540083461454, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.8196470737457275, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7018972039222717, + "num_tokens": 93680548.0, + "step": 3620 + }, + { + "epoch": 0.3976499011640676, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.3156466484069824, + "learning_rate": 1e-06, + "loss": 1.1815, + "mean_token_accuracy": 0.6578966975212097, + "num_tokens": 93710057.0, + "step": 3621 + }, + { + "epoch": 0.3977597188666813, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.1497671604156494, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6914239525794983, + "num_tokens": 93742099.0, + "step": 3622 + }, + { + "epoch": 0.39786953656929497, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.264909029006958, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7228145003318787, 
+ "num_tokens": 93767255.0, + "step": 3623 + }, + { + "epoch": 0.3979793542719086, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.68532657623291, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7055934071540833, + "num_tokens": 93787179.0, + "step": 3624 + }, + { + "epoch": 0.3980891719745223, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.090750217437744, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7173281908035278, + "num_tokens": 93818656.0, + "step": 3625 + }, + { + "epoch": 0.39819898967713596, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 2.3530218601226807, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.697408139705658, + "num_tokens": 93842689.0, + "step": 3626 + }, + { + "epoch": 0.3983088073797496, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.462083101272583, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7006134390830994, + "num_tokens": 93866051.0, + "step": 3627 + }, + { + "epoch": 0.39841862508236325, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.1064631938934326, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7098869681358337, + "num_tokens": 93894339.0, + "step": 3628 + }, + { + "epoch": 0.39852844278497696, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.641080379486084, + "learning_rate": 1e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6869716644287109, + "num_tokens": 93917128.0, + "step": 3629 + }, + { + "epoch": 0.3986382604875906, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.602024555206299, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7167938947677612, + "num_tokens": 93937068.0, + "step": 3630 + }, + { + "epoch": 0.39874807819020425, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 2.499558210372925, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7068144083023071, + "num_tokens": 93961825.0, + "step": 3631 + 
}, + { + "epoch": 0.39885789589281795, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.5616579055786133, + "learning_rate": 1e-06, + "loss": 1.1065, + "mean_token_accuracy": 0.6730912923812866, + "num_tokens": 93986820.0, + "step": 3632 + }, + { + "epoch": 0.3989677135954316, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.569803237915039, + "learning_rate": 1e-06, + "loss": 1.0902, + "mean_token_accuracy": 0.682004451751709, + "num_tokens": 94009183.0, + "step": 3633 + }, + { + "epoch": 0.39907753129804524, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.4178504943847656, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7062150239944458, + "num_tokens": 94035169.0, + "step": 3634 + }, + { + "epoch": 0.3991873490006589, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.336850881576538, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.698962390422821, + "num_tokens": 94058915.0, + "step": 3635 + }, + { + "epoch": 0.3992971667032726, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.2527804374694824, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7040445804595947, + "num_tokens": 94087673.0, + "step": 3636 + }, + { + "epoch": 0.39940698440588623, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.1649017333984375, + "learning_rate": 1e-06, + "loss": 1.1026, + "mean_token_accuracy": 0.679268479347229, + "num_tokens": 94117111.0, + "step": 3637 + }, + { + "epoch": 0.3995168021084999, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.4847426414489746, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7116429805755615, + "num_tokens": 94142061.0, + "step": 3638 + }, + { + "epoch": 0.3996266198111135, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.378932476043701, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7243388891220093, + "num_tokens": 94166473.0, + "step": 3639 + }, + { + "epoch": 0.3997364375137272, + 
"ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.1427695751190186, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.72099769115448, + "num_tokens": 94195339.0, + "step": 3640 + }, + { + "epoch": 0.3998462552163409, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.311591625213623, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7044291496276855, + "num_tokens": 94220333.0, + "step": 3641 + }, + { + "epoch": 0.3999560729189545, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.151001214981079, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7007384300231934, + "num_tokens": 94247191.0, + "step": 3642 + }, + { + "epoch": 0.4000658906215682, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.3851072788238525, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.678872287273407, + "num_tokens": 94272074.0, + "step": 3643 + }, + { + "epoch": 0.40017570832418187, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.548658847808838, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7228683233261108, + "num_tokens": 94292812.0, + "step": 3644 + }, + { + "epoch": 0.4002855260267955, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.5278964042663574, + "learning_rate": 1e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.691581130027771, + "num_tokens": 94317623.0, + "step": 3645 + }, + { + "epoch": 0.40039534372940916, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.158283233642578, + "learning_rate": 1e-06, + "loss": 1.1075, + "mean_token_accuracy": 0.6920824646949768, + "num_tokens": 94348024.0, + "step": 3646 + }, + { + "epoch": 0.40050516143202286, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.526334285736084, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6949094533920288, + "num_tokens": 94370827.0, + "step": 3647 + }, + { + "epoch": 0.4006149791346365, + "ewc_loss": 1.0192394256591797e-05, + 
"grad_norm": 2.507730484008789, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6845229268074036, + "num_tokens": 94395141.0, + "step": 3648 + }, + { + "epoch": 0.40072479683725015, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.052281379699707, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6868746876716614, + "num_tokens": 94427195.0, + "step": 3649 + }, + { + "epoch": 0.4008346145398638, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.2297849655151367, + "learning_rate": 1e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.6830261945724487, + "num_tokens": 94456986.0, + "step": 3650 + }, + { + "epoch": 0.4009444322424775, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.439840078353882, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6911494135856628, + "num_tokens": 94486412.0, + "step": 3651 + }, + { + "epoch": 0.40105424994509115, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.331596612930298, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.699715256690979, + "num_tokens": 94513341.0, + "step": 3652 + }, + { + "epoch": 0.4011640676477048, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.4094436168670654, + "learning_rate": 1e-06, + "loss": 1.0996, + "mean_token_accuracy": 0.6912669539451599, + "num_tokens": 94539325.0, + "step": 3653 + }, + { + "epoch": 0.4012738853503185, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.306854724884033, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6901547312736511, + "num_tokens": 94569341.0, + "step": 3654 + }, + { + "epoch": 0.40138370305293214, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.5420336723327637, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7031499147415161, + "num_tokens": 94590125.0, + "step": 3655 + }, + { + "epoch": 0.4014935207555458, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.334559202194214, + 
"learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6844218969345093, + "num_tokens": 94618712.0, + "step": 3656 + }, + { + "epoch": 0.40160333845815943, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.463282346725464, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7200186252593994, + "num_tokens": 94639634.0, + "step": 3657 + }, + { + "epoch": 0.40171315616077313, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.4523656368255615, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.681501567363739, + "num_tokens": 94663874.0, + "step": 3658 + }, + { + "epoch": 0.4018229738633868, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.3309972286224365, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6852185130119324, + "num_tokens": 94692854.0, + "step": 3659 + }, + { + "epoch": 0.4019327915660004, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.786294460296631, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.707167387008667, + "num_tokens": 94711951.0, + "step": 3660 + }, + { + "epoch": 0.4020426092686141, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.392775535583496, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6956393122673035, + "num_tokens": 94738139.0, + "step": 3661 + }, + { + "epoch": 0.40215242697122777, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.1982297897338867, + "learning_rate": 1e-06, + "loss": 1.142, + "mean_token_accuracy": 0.668516218662262, + "num_tokens": 94769276.0, + "step": 3662 + }, + { + "epoch": 0.4022622446738414, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 2.4500246047973633, + "learning_rate": 1e-06, + "loss": 1.1349, + "mean_token_accuracy": 0.6704920530319214, + "num_tokens": 94793987.0, + "step": 3663 + }, + { + "epoch": 0.40237206237645506, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.555675506591797, + "learning_rate": 1e-06, + "loss": 0.981, + 
"mean_token_accuracy": 0.7043882608413696, + "num_tokens": 94815453.0, + "step": 3664 + }, + { + "epoch": 0.40248188007906877, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.1860554218292236, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.715217113494873, + "num_tokens": 94842720.0, + "step": 3665 + }, + { + "epoch": 0.4025916977816824, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.4494168758392334, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6876654624938965, + "num_tokens": 94869987.0, + "step": 3666 + }, + { + "epoch": 0.40270151548429606, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.75410532951355, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6995244026184082, + "num_tokens": 94891386.0, + "step": 3667 + }, + { + "epoch": 0.4028113331869097, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.403452157974243, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6961470246315002, + "num_tokens": 94915405.0, + "step": 3668 + }, + { + "epoch": 0.4029211508895234, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.347442388534546, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.697737991809845, + "num_tokens": 94940529.0, + "step": 3669 + }, + { + "epoch": 0.40303096859213705, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.4257054328918457, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7026152610778809, + "num_tokens": 94963891.0, + "step": 3670 + }, + { + "epoch": 0.4031407862947507, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.4399075508117676, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6952148079872131, + "num_tokens": 94986480.0, + "step": 3671 + }, + { + "epoch": 0.4032506039973644, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.346029758453369, + "learning_rate": 1e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6812059879302979, + 
"num_tokens": 95012838.0, + "step": 3672 + }, + { + "epoch": 0.40336042169997804, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.5378968715667725, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6957160830497742, + "num_tokens": 95034266.0, + "step": 3673 + }, + { + "epoch": 0.4034702394025917, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.574861526489258, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.688673198223114, + "num_tokens": 95055943.0, + "step": 3674 + }, + { + "epoch": 0.40358005710520534, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.331226110458374, + "learning_rate": 1e-06, + "loss": 1.1206, + "mean_token_accuracy": 0.6794660687446594, + "num_tokens": 95083037.0, + "step": 3675 + }, + { + "epoch": 0.40368987480781904, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.524719476699829, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6891846060752869, + "num_tokens": 95109674.0, + "step": 3676 + }, + { + "epoch": 0.4037996925104327, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.7627525329589844, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7012905478477478, + "num_tokens": 95128170.0, + "step": 3677 + }, + { + "epoch": 0.40390951021304633, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.25522780418396, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6911087036132812, + "num_tokens": 95158536.0, + "step": 3678 + }, + { + "epoch": 0.40401932791566003, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.1754374504089355, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.7039203643798828, + "num_tokens": 95188713.0, + "step": 3679 + }, + { + "epoch": 0.4041291456182737, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.4544646739959717, + "learning_rate": 1e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.670647382736206, + "num_tokens": 95214405.0, + "step": 3680 + 
}, + { + "epoch": 0.4042389633208873, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.0890116691589355, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7189953923225403, + "num_tokens": 95244165.0, + "step": 3681 + }, + { + "epoch": 0.40434878102350097, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.3796048164367676, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7124924659729004, + "num_tokens": 95267477.0, + "step": 3682 + }, + { + "epoch": 0.40445859872611467, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.259211540222168, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6883842945098877, + "num_tokens": 95295044.0, + "step": 3683 + }, + { + "epoch": 0.4045684164287283, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.526395320892334, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6967610120773315, + "num_tokens": 95319298.0, + "step": 3684 + }, + { + "epoch": 0.40467823413134196, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.334700345993042, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7102094888687134, + "num_tokens": 95344447.0, + "step": 3685 + }, + { + "epoch": 0.4047880518339556, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.654688835144043, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7011070847511292, + "num_tokens": 95364424.0, + "step": 3686 + }, + { + "epoch": 0.4048978695365693, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.5227866172790527, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6918081045150757, + "num_tokens": 95387166.0, + "step": 3687 + }, + { + "epoch": 0.40500768723918296, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.347581386566162, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6903538703918457, + "num_tokens": 95414783.0, + "step": 3688 + }, + { + "epoch": 0.4051175049417966, + 
"ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.3576276302337646, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6967675685882568, + "num_tokens": 95440817.0, + "step": 3689 + }, + { + "epoch": 0.4052273226444103, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.503602981567383, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7011556625366211, + "num_tokens": 95463800.0, + "step": 3690 + }, + { + "epoch": 0.40533714034702395, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.3504838943481445, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7080614566802979, + "num_tokens": 95493112.0, + "step": 3691 + }, + { + "epoch": 0.4054469580496376, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.2683582305908203, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7061245441436768, + "num_tokens": 95519677.0, + "step": 3692 + }, + { + "epoch": 0.40555677575225124, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.1895992755889893, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6927368640899658, + "num_tokens": 95549291.0, + "step": 3693 + }, + { + "epoch": 0.40566659345486494, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.379302501678467, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7144582271575928, + "num_tokens": 95575323.0, + "step": 3694 + }, + { + "epoch": 0.4057764111574786, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.3342771530151367, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7083282470703125, + "num_tokens": 95601929.0, + "step": 3695 + }, + { + "epoch": 0.40588622886009224, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.1703567504882812, + "learning_rate": 1e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.6818068027496338, + "num_tokens": 95632256.0, + "step": 3696 + }, + { + "epoch": 0.4059960465627059, + "ewc_loss": 1.0251998901367188e-05, + 
"grad_norm": 2.350651741027832, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7096936702728271, + "num_tokens": 95657228.0, + "step": 3697 + }, + { + "epoch": 0.4061058642653196, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.353276252746582, + "learning_rate": 1e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6798721551895142, + "num_tokens": 95683329.0, + "step": 3698 + }, + { + "epoch": 0.40621568196793323, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.232346296310425, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6917119026184082, + "num_tokens": 95712366.0, + "step": 3699 + }, + { + "epoch": 0.4063254996705469, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.143373966217041, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6914084553718567, + "num_tokens": 95741651.0, + "step": 3700 + }, + { + "epoch": 0.4064353173731606, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.184391736984253, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6998836994171143, + "num_tokens": 95773832.0, + "step": 3701 + }, + { + "epoch": 0.4065451350757742, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.303657293319702, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7012072801589966, + "num_tokens": 95801416.0, + "step": 3702 + }, + { + "epoch": 0.40665495277838787, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 2.4277420043945312, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.7011154890060425, + "num_tokens": 95826058.0, + "step": 3703 + }, + { + "epoch": 0.4067647704810015, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.052541971206665, + "learning_rate": 1e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.6896581053733826, + "num_tokens": 95859076.0, + "step": 3704 + }, + { + "epoch": 0.4068745881836152, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.3219361305236816, + 
"learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.7020407915115356, + "num_tokens": 95884058.0, + "step": 3705 + }, + { + "epoch": 0.40698440588622886, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.449002981185913, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7118749022483826, + "num_tokens": 95906494.0, + "step": 3706 + }, + { + "epoch": 0.4070942235888425, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.2181484699249268, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6886774301528931, + "num_tokens": 95933524.0, + "step": 3707 + }, + { + "epoch": 0.4072040412914562, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.367044687271118, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7085002064704895, + "num_tokens": 95957723.0, + "step": 3708 + }, + { + "epoch": 0.40731385899406986, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.46264386177063, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6980479955673218, + "num_tokens": 95980974.0, + "step": 3709 + }, + { + "epoch": 0.4074236766966835, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.2022435665130615, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6969096064567566, + "num_tokens": 96010658.0, + "step": 3710 + }, + { + "epoch": 0.40753349439929715, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.4484777450561523, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.6795175671577454, + "num_tokens": 96036358.0, + "step": 3711 + }, + { + "epoch": 0.40764331210191085, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.33723521232605, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6927272081375122, + "num_tokens": 96062798.0, + "step": 3712 + }, + { + "epoch": 0.4077531298045245, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.2959468364715576, + "learning_rate": 1e-06, + "loss": 1.0372, 
+ "mean_token_accuracy": 0.6900827884674072, + "num_tokens": 96089971.0, + "step": 3713 + }, + { + "epoch": 0.40786294750713814, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.3305323123931885, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6879304647445679, + "num_tokens": 96116747.0, + "step": 3714 + }, + { + "epoch": 0.4079727652097518, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.4181978702545166, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6854016184806824, + "num_tokens": 96142612.0, + "step": 3715 + }, + { + "epoch": 0.4080825829123655, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.839080333709717, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6927301287651062, + "num_tokens": 96167683.0, + "step": 3716 + }, + { + "epoch": 0.40819240061497913, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.2843217849731445, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7105592489242554, + "num_tokens": 96195221.0, + "step": 3717 + }, + { + "epoch": 0.4083022183175928, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.281428813934326, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7122399806976318, + "num_tokens": 96221480.0, + "step": 3718 + }, + { + "epoch": 0.4084120360202065, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.689321994781494, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.69782954454422, + "num_tokens": 96242839.0, + "step": 3719 + }, + { + "epoch": 0.4085218537228201, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.702375888824463, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6907931566238403, + "num_tokens": 96264354.0, + "step": 3720 + }, + { + "epoch": 0.4086316714254338, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.183736801147461, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7147151827812195, + 
"num_tokens": 96292974.0, + "step": 3721 + }, + { + "epoch": 0.4087414891280474, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.569024085998535, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7037156224250793, + "num_tokens": 96317311.0, + "step": 3722 + }, + { + "epoch": 0.4088513068306611, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.390465497970581, + "learning_rate": 1e-06, + "loss": 1.0799, + "mean_token_accuracy": 0.6775795221328735, + "num_tokens": 96344098.0, + "step": 3723 + }, + { + "epoch": 0.40896112453327477, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.5310850143432617, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6982161998748779, + "num_tokens": 96368361.0, + "step": 3724 + }, + { + "epoch": 0.4090709422358884, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.2847084999084473, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7094591856002808, + "num_tokens": 96394469.0, + "step": 3725 + }, + { + "epoch": 0.40918075993850206, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.3944926261901855, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6919622421264648, + "num_tokens": 96419691.0, + "step": 3726 + }, + { + "epoch": 0.40929057764111576, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.0494136810302734, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6983226537704468, + "num_tokens": 96453773.0, + "step": 3727 + }, + { + "epoch": 0.4094003953437294, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.4052324295043945, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7206724882125854, + "num_tokens": 96478387.0, + "step": 3728 + }, + { + "epoch": 0.40951021304634305, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.3285202980041504, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6950554251670837, + "num_tokens": 96504213.0, + "step": 
3729 + }, + { + "epoch": 0.40962003074895675, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.133704662322998, + "learning_rate": 1e-06, + "loss": 1.078, + "mean_token_accuracy": 0.6856751441955566, + "num_tokens": 96536107.0, + "step": 3730 + }, + { + "epoch": 0.4097298484515704, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.351986885070801, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7075978517532349, + "num_tokens": 96561775.0, + "step": 3731 + }, + { + "epoch": 0.40983966615418405, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.5139803886413574, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.716558575630188, + "num_tokens": 96584036.0, + "step": 3732 + }, + { + "epoch": 0.4099494838567977, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.80410099029541, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.714855432510376, + "num_tokens": 96605831.0, + "step": 3733 + }, + { + "epoch": 0.4100593015594114, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.622992992401123, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.7034256458282471, + "num_tokens": 96627335.0, + "step": 3734 + }, + { + "epoch": 0.41016911926202504, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.3272571563720703, + "learning_rate": 1e-06, + "loss": 1.0968, + "mean_token_accuracy": 0.6889352202415466, + "num_tokens": 96655776.0, + "step": 3735 + }, + { + "epoch": 0.4102789369646387, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.2001283168792725, + "learning_rate": 1e-06, + "loss": 1.1179, + "mean_token_accuracy": 0.6830482482910156, + "num_tokens": 96686329.0, + "step": 3736 + }, + { + "epoch": 0.4103887546672524, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.073223114013672, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7093764543533325, + "num_tokens": 96717402.0, + "step": 3737 + }, + { + "epoch": 0.41049857236986603, 
+ "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.353930711746216, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7117026448249817, + "num_tokens": 96743834.0, + "step": 3738 + }, + { + "epoch": 0.4106083900724797, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.4687399864196777, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6948404312133789, + "num_tokens": 96766567.0, + "step": 3739 + }, + { + "epoch": 0.4107182077750933, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 2.3490192890167236, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6914552450180054, + "num_tokens": 96793264.0, + "step": 3740 + }, + { + "epoch": 0.410828025477707, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 2.1642353534698486, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6904373168945312, + "num_tokens": 96825333.0, + "step": 3741 + }, + { + "epoch": 0.41093784318032067, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 2.176042318344116, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7190245985984802, + "num_tokens": 96853811.0, + "step": 3742 + }, + { + "epoch": 0.4110476608829343, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.443885564804077, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.6987221240997314, + "num_tokens": 96878822.0, + "step": 3743 + }, + { + "epoch": 0.41115747858554796, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.609744071960449, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7283468842506409, + "num_tokens": 96901200.0, + "step": 3744 + }, + { + "epoch": 0.41126729628816167, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.7220776081085205, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.7005397081375122, + "num_tokens": 96920484.0, + "step": 3745 + }, + { + "epoch": 0.4113771139907753, + "ewc_loss": 1.043081283569336e-05, + 
"grad_norm": 2.46492600440979, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7020060420036316, + "num_tokens": 96944095.0, + "step": 3746 + }, + { + "epoch": 0.41148693169338896, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.681979179382324, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6999901533126831, + "num_tokens": 96964442.0, + "step": 3747 + }, + { + "epoch": 0.41159674939600266, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.282790184020996, + "learning_rate": 1e-06, + "loss": 1.0778, + "mean_token_accuracy": 0.6850400567054749, + "num_tokens": 96994310.0, + "step": 3748 + }, + { + "epoch": 0.4117065670986163, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 2.686659336090088, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.714516282081604, + "num_tokens": 97015086.0, + "step": 3749 + }, + { + "epoch": 0.41181638480122995, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.599214792251587, + "learning_rate": 1e-06, + "loss": 1.0977, + "mean_token_accuracy": 0.682540237903595, + "num_tokens": 97038406.0, + "step": 3750 + }, + { + "epoch": 0.4119262025038436, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.550896644592285, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6910550594329834, + "num_tokens": 97063504.0, + "step": 3751 + }, + { + "epoch": 0.4120360202064573, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.571547508239746, + "learning_rate": 1e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6784895658493042, + "num_tokens": 97087001.0, + "step": 3752 + }, + { + "epoch": 0.41214583790907094, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.218194007873535, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7102637887001038, + "num_tokens": 97114967.0, + "step": 3753 + }, + { + "epoch": 0.4122556556116846, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.4442198276519775, + "learning_rate": 
1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7191797494888306, + "num_tokens": 97137484.0, + "step": 3754 + }, + { + "epoch": 0.4123654733142983, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.2353997230529785, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7005890607833862, + "num_tokens": 97164287.0, + "step": 3755 + }, + { + "epoch": 0.41247529101691194, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.572913408279419, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7211365699768066, + "num_tokens": 97185153.0, + "step": 3756 + }, + { + "epoch": 0.4125851087195256, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.292630910873413, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.695732593536377, + "num_tokens": 97211905.0, + "step": 3757 + }, + { + "epoch": 0.41269492642213923, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.334481716156006, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6957254409790039, + "num_tokens": 97238603.0, + "step": 3758 + }, + { + "epoch": 0.41280474412475293, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.0436952114105225, + "learning_rate": 1e-06, + "loss": 1.0932, + "mean_token_accuracy": 0.6770805716514587, + "num_tokens": 97271073.0, + "step": 3759 + }, + { + "epoch": 0.4129145618273666, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.349307060241699, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6984809637069702, + "num_tokens": 97297350.0, + "step": 3760 + }, + { + "epoch": 0.4130243795299802, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 2.3936526775360107, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7042779922485352, + "num_tokens": 97321488.0, + "step": 3761 + }, + { + "epoch": 0.41313419723259387, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 2.280792474746704, + "learning_rate": 1e-06, + "loss": 1.1326, + 
"mean_token_accuracy": 0.6806503534317017, + "num_tokens": 97349133.0, + "step": 3762 + }, + { + "epoch": 0.41324401493520757, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 2.3686110973358154, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6972876787185669, + "num_tokens": 97373337.0, + "step": 3763 + }, + { + "epoch": 0.4133538326378212, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 2.739213228225708, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.731364905834198, + "num_tokens": 97393305.0, + "step": 3764 + }, + { + "epoch": 0.41346365034043486, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.3238048553466797, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.7000554800033569, + "num_tokens": 97420834.0, + "step": 3765 + }, + { + "epoch": 0.41357346804304856, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 2.3692102432250977, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6932569742202759, + "num_tokens": 97447875.0, + "step": 3766 + }, + { + "epoch": 0.4136832857456622, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.581707715988159, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7160888910293579, + "num_tokens": 97468715.0, + "step": 3767 + }, + { + "epoch": 0.41379310344827586, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.5610153675079346, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6969801783561707, + "num_tokens": 97491436.0, + "step": 3768 + }, + { + "epoch": 0.4139029211508895, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.220067024230957, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6918895840644836, + "num_tokens": 97521771.0, + "step": 3769 + }, + { + "epoch": 0.4140127388535032, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.3135788440704346, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6973109245300293, + 
"num_tokens": 97548123.0, + "step": 3770 + }, + { + "epoch": 0.41412255655611685, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.419605255126953, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7186092138290405, + "num_tokens": 97569764.0, + "step": 3771 + }, + { + "epoch": 0.4142323742587305, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.256460189819336, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6968324184417725, + "num_tokens": 97598522.0, + "step": 3772 + }, + { + "epoch": 0.41434219196134414, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.338226079940796, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7335999011993408, + "num_tokens": 97623658.0, + "step": 3773 + }, + { + "epoch": 0.41445200966395784, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.144594430923462, + "learning_rate": 1e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.6868707537651062, + "num_tokens": 97654071.0, + "step": 3774 + }, + { + "epoch": 0.4145618273665715, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.4791300296783447, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6946430206298828, + "num_tokens": 97677831.0, + "step": 3775 + }, + { + "epoch": 0.41467164506918514, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.4653327465057373, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6848629117012024, + "num_tokens": 97702503.0, + "step": 3776 + }, + { + "epoch": 0.41478146277179884, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.581372022628784, + "learning_rate": 1e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6865518093109131, + "num_tokens": 97724802.0, + "step": 3777 + }, + { + "epoch": 0.4148912804744125, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.4763102531433105, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7115570306777954, + "num_tokens": 97749234.0, + "step": 3778 + }, + 
{ + "epoch": 0.41500109817702613, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.484881639480591, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7102565765380859, + "num_tokens": 97770698.0, + "step": 3779 + }, + { + "epoch": 0.4151109158796398, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.457796812057495, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6996411681175232, + "num_tokens": 97794172.0, + "step": 3780 + }, + { + "epoch": 0.4152207335822535, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.690476655960083, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.700491189956665, + "num_tokens": 97815066.0, + "step": 3781 + }, + { + "epoch": 0.4153305512848671, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.3589489459991455, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7251459360122681, + "num_tokens": 97841199.0, + "step": 3782 + }, + { + "epoch": 0.41544036898748077, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.4521994590759277, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.691834568977356, + "num_tokens": 97865472.0, + "step": 3783 + }, + { + "epoch": 0.41555018669009447, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.562361478805542, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7107787132263184, + "num_tokens": 97886053.0, + "step": 3784 + }, + { + "epoch": 0.4156600043927081, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.6216654777526855, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7023388147354126, + "num_tokens": 97908195.0, + "step": 3785 + }, + { + "epoch": 0.41576982209532176, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.3091094493865967, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7070260047912598, + "num_tokens": 97935081.0, + "step": 3786 + }, + { + "epoch": 0.4158796397979354, + "ewc_loss": 
1.049041748046875e-05, + "grad_norm": 2.174708604812622, + "learning_rate": 1e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6779586672782898, + "num_tokens": 97966325.0, + "step": 3787 + }, + { + "epoch": 0.4159894575005491, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4956812858581543, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6983475685119629, + "num_tokens": 97991457.0, + "step": 3788 + }, + { + "epoch": 0.41609927520316276, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.301105260848999, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6856839060783386, + "num_tokens": 98018341.0, + "step": 3789 + }, + { + "epoch": 0.4162090929057764, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.2780768871307373, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7050563097000122, + "num_tokens": 98045098.0, + "step": 3790 + }, + { + "epoch": 0.41631891060839005, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4784369468688965, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6847469210624695, + "num_tokens": 98069357.0, + "step": 3791 + }, + { + "epoch": 0.41642872831100375, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4590868949890137, + "learning_rate": 1e-06, + "loss": 1.0989, + "mean_token_accuracy": 0.679273247718811, + "num_tokens": 98094848.0, + "step": 3792 + }, + { + "epoch": 0.4165385460136174, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.5415141582489014, + "learning_rate": 1e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.684253454208374, + "num_tokens": 98118179.0, + "step": 3793 + }, + { + "epoch": 0.41664836371623104, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4854750633239746, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6945507526397705, + "num_tokens": 98142211.0, + "step": 3794 + }, + { + "epoch": 0.41675818141884474, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 
2.4393062591552734, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6944060325622559, + "num_tokens": 98168425.0, + "step": 3795 + }, + { + "epoch": 0.4168679991214584, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.703179359436035, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7115722298622131, + "num_tokens": 98191140.0, + "step": 3796 + }, + { + "epoch": 0.41697781682407203, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.232789993286133, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7098333835601807, + "num_tokens": 98221268.0, + "step": 3797 + }, + { + "epoch": 0.4170876345266857, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.5204920768737793, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7115073800086975, + "num_tokens": 98243642.0, + "step": 3798 + }, + { + "epoch": 0.4171974522292994, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.448014259338379, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7282207608222961, + "num_tokens": 98265808.0, + "step": 3799 + }, + { + "epoch": 0.417307269931913, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.11991286277771, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.709260106086731, + "num_tokens": 98293589.0, + "step": 3800 + }, + { + "epoch": 0.4174170876345267, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.340815782546997, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6920733451843262, + "num_tokens": 98321281.0, + "step": 3801 + }, + { + "epoch": 0.4175269053371403, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.355471611022949, + "learning_rate": 1e-06, + "loss": 1.1024, + "mean_token_accuracy": 0.6769025325775146, + "num_tokens": 98347302.0, + "step": 3802 + }, + { + "epoch": 0.417636723039754, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.367835521697998, + "learning_rate": 1e-06, + "loss": 
1.0859, + "mean_token_accuracy": 0.675694465637207, + "num_tokens": 98375310.0, + "step": 3803 + }, + { + "epoch": 0.41774654074236767, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.086426258087158, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7139490842819214, + "num_tokens": 98405962.0, + "step": 3804 + }, + { + "epoch": 0.4178563584449813, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.4216842651367188, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7325706481933594, + "num_tokens": 98429344.0, + "step": 3805 + }, + { + "epoch": 0.417966176147595, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.391662120819092, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7005136013031006, + "num_tokens": 98455018.0, + "step": 3806 + }, + { + "epoch": 0.41807599385020866, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.1803624629974365, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.7069722414016724, + "num_tokens": 98484221.0, + "step": 3807 + }, + { + "epoch": 0.4181858115528223, + "ewc_loss": 1.043081283569336e-05, + "grad_norm": 2.1365935802459717, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6946609616279602, + "num_tokens": 98515513.0, + "step": 3808 + }, + { + "epoch": 0.41829562925543595, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.610407590866089, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7132768630981445, + "num_tokens": 98537078.0, + "step": 3809 + }, + { + "epoch": 0.41840544695804965, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.638514995574951, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7036207318305969, + "num_tokens": 98556703.0, + "step": 3810 + }, + { + "epoch": 0.4185152646606633, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.266932725906372, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 
0.6874677538871765, + "num_tokens": 98586006.0, + "step": 3811 + }, + { + "epoch": 0.41862508236327695, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.460665225982666, + "learning_rate": 1e-06, + "loss": 1.0893, + "mean_token_accuracy": 0.680362343788147, + "num_tokens": 98609590.0, + "step": 3812 + }, + { + "epoch": 0.41873490006589065, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.520875930786133, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7061400413513184, + "num_tokens": 98632955.0, + "step": 3813 + }, + { + "epoch": 0.4188447177685043, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.594926118850708, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6973078846931458, + "num_tokens": 98654276.0, + "step": 3814 + }, + { + "epoch": 0.41895453547111794, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.628143072128296, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6983463168144226, + "num_tokens": 98676588.0, + "step": 3815 + }, + { + "epoch": 0.4190643531737316, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.2394444942474365, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6903512477874756, + "num_tokens": 98704836.0, + "step": 3816 + }, + { + "epoch": 0.4191741708763453, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.2466671466827393, + "learning_rate": 1e-06, + "loss": 1.13, + "mean_token_accuracy": 0.6693254113197327, + "num_tokens": 98736500.0, + "step": 3817 + }, + { + "epoch": 0.41928398857895893, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.646716833114624, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6968396306037903, + "num_tokens": 98758217.0, + "step": 3818 + }, + { + "epoch": 0.4193938062815726, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.530996084213257, + "learning_rate": 1e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.6785238981246948, + "num_tokens": 98782642.0, + 
"step": 3819 + }, + { + "epoch": 0.4195036239841862, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.3664937019348145, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6986602544784546, + "num_tokens": 98809715.0, + "step": 3820 + }, + { + "epoch": 0.4196134416867999, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.521444320678711, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7053495645523071, + "num_tokens": 98832396.0, + "step": 3821 + }, + { + "epoch": 0.41972325938941357, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.5673165321350098, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.7060815691947937, + "num_tokens": 98855032.0, + "step": 3822 + }, + { + "epoch": 0.4198330770920272, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.167776107788086, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.681523323059082, + "num_tokens": 98885502.0, + "step": 3823 + }, + { + "epoch": 0.4199428947946409, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4331743717193604, + "learning_rate": 1e-06, + "loss": 1.1049, + "mean_token_accuracy": 0.6807482242584229, + "num_tokens": 98910833.0, + "step": 3824 + }, + { + "epoch": 0.42005271249725457, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.306269884109497, + "learning_rate": 1e-06, + "loss": 1.1038, + "mean_token_accuracy": 0.6769670844078064, + "num_tokens": 98939922.0, + "step": 3825 + }, + { + "epoch": 0.4201625301998682, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4225707054138184, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6969976425170898, + "num_tokens": 98966447.0, + "step": 3826 + }, + { + "epoch": 0.42027234790248186, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.376044988632202, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7007019519805908, + "num_tokens": 98990069.0, + "step": 3827 + }, + { + "epoch": 
0.42038216560509556, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.427175521850586, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7105692028999329, + "num_tokens": 99018779.0, + "step": 3828 + }, + { + "epoch": 0.4204919833077092, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.252993106842041, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6803290843963623, + "num_tokens": 99046989.0, + "step": 3829 + }, + { + "epoch": 0.42060180101032285, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.3192193508148193, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.689350962638855, + "num_tokens": 99075830.0, + "step": 3830 + }, + { + "epoch": 0.42071161871293655, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.406463623046875, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6918177008628845, + "num_tokens": 99104043.0, + "step": 3831 + }, + { + "epoch": 0.4208214364155502, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.210237741470337, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6889010667800903, + "num_tokens": 99134589.0, + "step": 3832 + }, + { + "epoch": 0.42093125411816384, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4487457275390625, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6868098974227905, + "num_tokens": 99159009.0, + "step": 3833 + }, + { + "epoch": 0.4210410718207775, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.2756741046905518, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.685588002204895, + "num_tokens": 99188690.0, + "step": 3834 + }, + { + "epoch": 0.4211508895233912, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.529832601547241, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7197810411453247, + "num_tokens": 99209626.0, + "step": 3835 + }, + { + "epoch": 0.42126070722600484, + "ewc_loss": 
1.055002212524414e-05, + "grad_norm": 2.451448917388916, + "learning_rate": 1e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6915138959884644, + "num_tokens": 99237195.0, + "step": 3836 + }, + { + "epoch": 0.4213705249286185, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.3608596324920654, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7152683138847351, + "num_tokens": 99265505.0, + "step": 3837 + }, + { + "epoch": 0.42148034263123213, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.4226744174957275, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7164748311042786, + "num_tokens": 99290547.0, + "step": 3838 + }, + { + "epoch": 0.42159016033384583, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.5720784664154053, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7006842494010925, + "num_tokens": 99313701.0, + "step": 3839 + }, + { + "epoch": 0.4216999780364595, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.3123490810394287, + "learning_rate": 1e-06, + "loss": 1.1074, + "mean_token_accuracy": 0.6756836175918579, + "num_tokens": 99344109.0, + "step": 3840 + }, + { + "epoch": 0.4218097957390731, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.3561348915100098, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7060872316360474, + "num_tokens": 99370893.0, + "step": 3841 + }, + { + "epoch": 0.4219196134416868, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4241795539855957, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7018365859985352, + "num_tokens": 99393832.0, + "step": 3842 + }, + { + "epoch": 0.42202943114430047, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.589761257171631, + "learning_rate": 1e-06, + "loss": 1.1488, + "mean_token_accuracy": 0.6656678915023804, + "num_tokens": 99418060.0, + "step": 3843 + }, + { + "epoch": 0.4221392488469141, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 
2.468482494354248, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7098934054374695, + "num_tokens": 99441684.0, + "step": 3844 + }, + { + "epoch": 0.42224906654952776, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.335162401199341, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7012197375297546, + "num_tokens": 99469380.0, + "step": 3845 + }, + { + "epoch": 0.42235888425214146, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.916914939880371, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7268441319465637, + "num_tokens": 99487015.0, + "step": 3846 + }, + { + "epoch": 0.4224687019547551, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.412910223007202, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6865009665489197, + "num_tokens": 99513084.0, + "step": 3847 + }, + { + "epoch": 0.42257851965736876, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4429235458374023, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6895546913146973, + "num_tokens": 99538790.0, + "step": 3848 + }, + { + "epoch": 0.4226883373599824, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.3730902671813965, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7090348601341248, + "num_tokens": 99563439.0, + "step": 3849 + }, + { + "epoch": 0.4227981550625961, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.414397716522217, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7227901816368103, + "num_tokens": 99586655.0, + "step": 3850 + }, + { + "epoch": 0.42290797276520975, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.364712715148926, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.6836836338043213, + "num_tokens": 99611237.0, + "step": 3851 + }, + { + "epoch": 0.4230177904678234, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.0851306915283203, + "learning_rate": 1e-06, + 
"loss": 1.0409, + "mean_token_accuracy": 0.6928364634513855, + "num_tokens": 99645810.0, + "step": 3852 + }, + { + "epoch": 0.4231276081704371, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.7168643474578857, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6971246004104614, + "num_tokens": 99664956.0, + "step": 3853 + }, + { + "epoch": 0.42323742587305074, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.212470054626465, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7143797278404236, + "num_tokens": 99693325.0, + "step": 3854 + }, + { + "epoch": 0.4233472435756644, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.5204901695251465, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.716581404209137, + "num_tokens": 99715867.0, + "step": 3855 + }, + { + "epoch": 0.42345706127827804, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.314648151397705, + "learning_rate": 1e-06, + "loss": 1.1305, + "mean_token_accuracy": 0.6721469163894653, + "num_tokens": 99745782.0, + "step": 3856 + }, + { + "epoch": 0.42356687898089174, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.513481855392456, + "learning_rate": 1e-06, + "loss": 1.0812, + "mean_token_accuracy": 0.6819616556167603, + "num_tokens": 99768440.0, + "step": 3857 + }, + { + "epoch": 0.4236766966835054, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.5591278076171875, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6922622919082642, + "num_tokens": 99789971.0, + "step": 3858 + }, + { + "epoch": 0.42378651438611903, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.5130670070648193, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6980390548706055, + "num_tokens": 99813570.0, + "step": 3859 + }, + { + "epoch": 0.42389633208873273, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.486926555633545, + "learning_rate": 1e-06, + "loss": 1.0798, + "mean_token_accuracy": 
0.6854355335235596, + "num_tokens": 99837657.0, + "step": 3860 + }, + { + "epoch": 0.4240061497913464, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.2737653255462646, + "learning_rate": 1e-06, + "loss": 1.1435, + "mean_token_accuracy": 0.6643834114074707, + "num_tokens": 99868728.0, + "step": 3861 + }, + { + "epoch": 0.42411596749396, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.361637592315674, + "learning_rate": 1e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6817194223403931, + "num_tokens": 99896969.0, + "step": 3862 + }, + { + "epoch": 0.42422578519657367, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.551297426223755, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7084469795227051, + "num_tokens": 99919296.0, + "step": 3863 + }, + { + "epoch": 0.42433560289918737, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.6549978256225586, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7072864770889282, + "num_tokens": 99941534.0, + "step": 3864 + }, + { + "epoch": 0.424445420601801, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.255094051361084, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.705452561378479, + "num_tokens": 99968505.0, + "step": 3865 + }, + { + "epoch": 0.42455523830441466, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.3134288787841797, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7302976846694946, + "num_tokens": 99993455.0, + "step": 3866 + }, + { + "epoch": 0.4246650560070283, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.7307024002075195, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6986314058303833, + "num_tokens": 100018287.0, + "step": 3867 + }, + { + "epoch": 0.424774873709642, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.6491618156433105, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7083950638771057, + "num_tokens": 100040541.0, + 
"step": 3868 + }, + { + "epoch": 0.42488469141225566, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.283057451248169, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7170735597610474, + "num_tokens": 100066442.0, + "step": 3869 + }, + { + "epoch": 0.4249945091148693, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4919724464416504, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7504504919052124, + "num_tokens": 100086709.0, + "step": 3870 + }, + { + "epoch": 0.425104326817483, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4773824214935303, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7117346525192261, + "num_tokens": 100110542.0, + "step": 3871 + }, + { + "epoch": 0.42521414452009665, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.189789056777954, + "learning_rate": 1e-06, + "loss": 1.1353, + "mean_token_accuracy": 0.672616720199585, + "num_tokens": 100141320.0, + "step": 3872 + }, + { + "epoch": 0.4253239622227103, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.5277578830718994, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.684890627861023, + "num_tokens": 100164223.0, + "step": 3873 + }, + { + "epoch": 0.42543377992532394, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.5702402591705322, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7121055722236633, + "num_tokens": 100185941.0, + "step": 3874 + }, + { + "epoch": 0.42554359762793764, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.29365611076355, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7090400457382202, + "num_tokens": 100210732.0, + "step": 3875 + }, + { + "epoch": 0.4256534153305513, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.276301383972168, + "learning_rate": 1e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.6816874742507935, + "num_tokens": 100238137.0, + "step": 3876 + }, + { + "epoch": 
0.42576323303316493, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.345518112182617, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6886645555496216, + "num_tokens": 100264186.0, + "step": 3877 + }, + { + "epoch": 0.4258730507357786, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.1459741592407227, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6970856189727783, + "num_tokens": 100294331.0, + "step": 3878 + }, + { + "epoch": 0.4259828684383923, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 2.4576683044433594, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7325447201728821, + "num_tokens": 100316130.0, + "step": 3879 + }, + { + "epoch": 0.4260926861410059, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.496103048324585, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6992653012275696, + "num_tokens": 100340052.0, + "step": 3880 + }, + { + "epoch": 0.4262025038436196, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.4929914474487305, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6962471604347229, + "num_tokens": 100365026.0, + "step": 3881 + }, + { + "epoch": 0.4263123215462333, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.5175206661224365, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6950463056564331, + "num_tokens": 100386885.0, + "step": 3882 + }, + { + "epoch": 0.4264221392488469, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.347966432571411, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6819486618041992, + "num_tokens": 100413857.0, + "step": 3883 + }, + { + "epoch": 0.42653195695146057, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 1.9863982200622559, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7001692056655884, + "num_tokens": 100448443.0, + "step": 3884 + }, + { + "epoch": 0.4266417746540742, + "ewc_loss": 
1.055002212524414e-05, + "grad_norm": 2.384310483932495, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7278217673301697, + "num_tokens": 100473036.0, + "step": 3885 + }, + { + "epoch": 0.4267515923566879, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.6959636211395264, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6962584257125854, + "num_tokens": 100492808.0, + "step": 3886 + }, + { + "epoch": 0.42686141005930156, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.4748010635375977, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6852216720581055, + "num_tokens": 100516357.0, + "step": 3887 + }, + { + "epoch": 0.4269712277619152, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.3660402297973633, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.706545352935791, + "num_tokens": 100539907.0, + "step": 3888 + }, + { + "epoch": 0.4270810454645289, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.0817813873291016, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.713454008102417, + "num_tokens": 100570127.0, + "step": 3889 + }, + { + "epoch": 0.42719086316714255, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.9880454540252686, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7210304737091064, + "num_tokens": 100587307.0, + "step": 3890 + }, + { + "epoch": 0.4273006808697562, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.580950975418091, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6940306425094604, + "num_tokens": 100610619.0, + "step": 3891 + }, + { + "epoch": 0.42741049857236985, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.251295566558838, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6929020285606384, + "num_tokens": 100639567.0, + "step": 3892 + }, + { + "epoch": 0.42752031627498355, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 
2.60115122795105, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7032842636108398, + "num_tokens": 100659524.0, + "step": 3893 + }, + { + "epoch": 0.4276301339775972, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.5111019611358643, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7085036039352417, + "num_tokens": 100681819.0, + "step": 3894 + }, + { + "epoch": 0.42773995168021084, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.063918113708496, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7007973194122314, + "num_tokens": 100715905.0, + "step": 3895 + }, + { + "epoch": 0.4278497693828245, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.123627185821533, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6921027898788452, + "num_tokens": 100751228.0, + "step": 3896 + }, + { + "epoch": 0.4279595870854382, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.579949378967285, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7016595602035522, + "num_tokens": 100773363.0, + "step": 3897 + }, + { + "epoch": 0.42806940478805183, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.171645164489746, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6883373260498047, + "num_tokens": 100801727.0, + "step": 3898 + }, + { + "epoch": 0.4281792224906655, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.0719597339630127, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7029939889907837, + "num_tokens": 100834173.0, + "step": 3899 + }, + { + "epoch": 0.4282890401932792, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.7265470027923584, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6985541582107544, + "num_tokens": 100855435.0, + "step": 3900 + }, + { + "epoch": 0.4283988578958928, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.1952567100524902, + "learning_rate": 1e-06, 
+ "loss": 1.0478, + "mean_token_accuracy": 0.6902369260787964, + "num_tokens": 100884327.0, + "step": 3901 + }, + { + "epoch": 0.42850867559850647, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.325054168701172, + "learning_rate": 1e-06, + "loss": 1.0886, + "mean_token_accuracy": 0.6753978729248047, + "num_tokens": 100910211.0, + "step": 3902 + }, + { + "epoch": 0.4286184933011201, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.428934097290039, + "learning_rate": 1e-06, + "loss": 1.091, + "mean_token_accuracy": 0.6835848093032837, + "num_tokens": 100935401.0, + "step": 3903 + }, + { + "epoch": 0.4287283110037338, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.5670981407165527, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.711746096611023, + "num_tokens": 100958877.0, + "step": 3904 + }, + { + "epoch": 0.42883812870634747, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.479539155960083, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7114959955215454, + "num_tokens": 100980312.0, + "step": 3905 + }, + { + "epoch": 0.4289479464089611, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.1536827087402344, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7157372832298279, + "num_tokens": 101008596.0, + "step": 3906 + }, + { + "epoch": 0.4290577641115748, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.2925069332122803, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7162924408912659, + "num_tokens": 101033996.0, + "step": 3907 + }, + { + "epoch": 0.42916758181418846, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.4001567363739014, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7039928436279297, + "num_tokens": 101057963.0, + "step": 3908 + }, + { + "epoch": 0.4292773995168021, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.854045867919922, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 
0.6935421824455261, + "num_tokens": 101077804.0, + "step": 3909 + }, + { + "epoch": 0.42938721721941575, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.7311222553253174, + "learning_rate": 1e-06, + "loss": 1.1042, + "mean_token_accuracy": 0.6854194402694702, + "num_tokens": 101100214.0, + "step": 3910 + }, + { + "epoch": 0.42949703492202945, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.563204765319824, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7239203453063965, + "num_tokens": 101124305.0, + "step": 3911 + }, + { + "epoch": 0.4296068526246431, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.2834420204162598, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7192957401275635, + "num_tokens": 101150289.0, + "step": 3912 + }, + { + "epoch": 0.42971667032725674, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.352417230606079, + "learning_rate": 1e-06, + "loss": 1.0817, + "mean_token_accuracy": 0.6888368725776672, + "num_tokens": 101178339.0, + "step": 3913 + }, + { + "epoch": 0.4298264880298704, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.4268698692321777, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7127751111984253, + "num_tokens": 101202614.0, + "step": 3914 + }, + { + "epoch": 0.4299363057324841, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.387770652770996, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7198134660720825, + "num_tokens": 101224528.0, + "step": 3915 + }, + { + "epoch": 0.43004612343509774, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.264768600463867, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7314041256904602, + "num_tokens": 101249037.0, + "step": 3916 + }, + { + "epoch": 0.4301559411377114, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.1273863315582275, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6933704614639282, + "num_tokens": 
101282444.0, + "step": 3917 + }, + { + "epoch": 0.4302657588403251, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.3326454162597656, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7067312002182007, + "num_tokens": 101313358.0, + "step": 3918 + }, + { + "epoch": 0.43037557654293873, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 2.2337112426757812, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.697091817855835, + "num_tokens": 101340581.0, + "step": 3919 + }, + { + "epoch": 0.4304853942455524, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.6293447017669678, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7149350643157959, + "num_tokens": 101361533.0, + "step": 3920 + }, + { + "epoch": 0.430595211948166, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.4967992305755615, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7091934084892273, + "num_tokens": 101383326.0, + "step": 3921 + }, + { + "epoch": 0.4307050296507797, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.3944091796875, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.7043923139572144, + "num_tokens": 101409081.0, + "step": 3922 + }, + { + "epoch": 0.43081484735339337, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.207719326019287, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6964596509933472, + "num_tokens": 101438699.0, + "step": 3923 + }, + { + "epoch": 0.430924665056007, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.252742290496826, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7112051248550415, + "num_tokens": 101464864.0, + "step": 3924 + }, + { + "epoch": 0.43103448275862066, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.553835391998291, + "learning_rate": 1e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.6887969970703125, + "num_tokens": 101488078.0, + "step": 3925 + }, + { + 
"epoch": 0.43114430046123436, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.3690719604492188, + "learning_rate": 1e-06, + "loss": 1.1162, + "mean_token_accuracy": 0.6794753074645996, + "num_tokens": 101514050.0, + "step": 3926 + }, + { + "epoch": 0.431254118163848, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.567704677581787, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7215265035629272, + "num_tokens": 101536723.0, + "step": 3927 + }, + { + "epoch": 0.43136393586646166, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.382746934890747, + "learning_rate": 1e-06, + "loss": 1.0791, + "mean_token_accuracy": 0.6878368854522705, + "num_tokens": 101561970.0, + "step": 3928 + }, + { + "epoch": 0.43147375356907536, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2985482215881348, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.7018784880638123, + "num_tokens": 101589583.0, + "step": 3929 + }, + { + "epoch": 0.431583571271689, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.1620230674743652, + "learning_rate": 1e-06, + "loss": 1.0694, + "mean_token_accuracy": 0.6820076107978821, + "num_tokens": 101621673.0, + "step": 3930 + }, + { + "epoch": 0.43169338897430265, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.5143253803253174, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6901246905326843, + "num_tokens": 101644065.0, + "step": 3931 + }, + { + "epoch": 0.4318032066769163, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.9779245853424072, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7283048629760742, + "num_tokens": 101662750.0, + "step": 3932 + }, + { + "epoch": 0.43191302437953, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2595629692077637, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7086668014526367, + "num_tokens": 101688990.0, + "step": 3933 + }, + { + "epoch": 0.43202284208214364, + 
"ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.532790184020996, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6954430341720581, + "num_tokens": 101710686.0, + "step": 3934 + }, + { + "epoch": 0.4321326597847573, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.3795931339263916, + "learning_rate": 1e-06, + "loss": 1.0837, + "mean_token_accuracy": 0.6892300844192505, + "num_tokens": 101737899.0, + "step": 3935 + }, + { + "epoch": 0.432242477487371, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2802047729492188, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6836135387420654, + "num_tokens": 101766454.0, + "step": 3936 + }, + { + "epoch": 0.43235229518998464, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.367690086364746, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.712944507598877, + "num_tokens": 101791654.0, + "step": 3937 + }, + { + "epoch": 0.4324621128925983, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.380192995071411, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6985479593276978, + "num_tokens": 101817589.0, + "step": 3938 + }, + { + "epoch": 0.43257193059521193, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.1483445167541504, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.691972553730011, + "num_tokens": 101847859.0, + "step": 3939 + }, + { + "epoch": 0.43268174829782563, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.3967549800872803, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6987357139587402, + "num_tokens": 101871930.0, + "step": 3940 + }, + { + "epoch": 0.4327915660004393, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.458430290222168, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7024412751197815, + "num_tokens": 101898638.0, + "step": 3941 + }, + { + "epoch": 0.4329013837030529, + "ewc_loss": 1.0669231414794922e-05, 
+ "grad_norm": 2.3979318141937256, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6906052827835083, + "num_tokens": 101926998.0, + "step": 3942 + }, + { + "epoch": 0.43301120140566657, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.6572370529174805, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6896196603775024, + "num_tokens": 101949123.0, + "step": 3943 + }, + { + "epoch": 0.43312101910828027, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.4181535243988037, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7126776576042175, + "num_tokens": 101971407.0, + "step": 3944 + }, + { + "epoch": 0.4332308368108939, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.358431816101074, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.7015324831008911, + "num_tokens": 101997943.0, + "step": 3945 + }, + { + "epoch": 0.43334065451350756, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2200019359588623, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7049821019172668, + "num_tokens": 102025822.0, + "step": 3946 + }, + { + "epoch": 0.43345047221612126, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2673850059509277, + "learning_rate": 1e-06, + "loss": 1.0778, + "mean_token_accuracy": 0.6877737045288086, + "num_tokens": 102053305.0, + "step": 3947 + }, + { + "epoch": 0.4335602899187349, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.3717122077941895, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.7024179697036743, + "num_tokens": 102078836.0, + "step": 3948 + }, + { + "epoch": 0.43367010762134856, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2904598712921143, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.722357451915741, + "num_tokens": 102103001.0, + "step": 3949 + }, + { + "epoch": 0.4337799253239622, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 
2.24782657623291, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6829642653465271, + "num_tokens": 102131895.0, + "step": 3950 + }, + { + "epoch": 0.4338897430265759, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.539134979248047, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7192172408103943, + "num_tokens": 102156023.0, + "step": 3951 + }, + { + "epoch": 0.43399956072918955, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.7128336429595947, + "learning_rate": 1e-06, + "loss": 1.0936, + "mean_token_accuracy": 0.6854469180107117, + "num_tokens": 102176947.0, + "step": 3952 + }, + { + "epoch": 0.4341093784318032, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.44570255279541, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6974470019340515, + "num_tokens": 102203157.0, + "step": 3953 + }, + { + "epoch": 0.43421919613441684, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.5707857608795166, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7007452845573425, + "num_tokens": 102226106.0, + "step": 3954 + }, + { + "epoch": 0.43432901383703054, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.5304996967315674, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.693611204624176, + "num_tokens": 102250220.0, + "step": 3955 + }, + { + "epoch": 0.4344388315396442, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.3626914024353027, + "learning_rate": 1e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6782499551773071, + "num_tokens": 102276373.0, + "step": 3956 + }, + { + "epoch": 0.43454864924225783, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.4611241817474365, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7075475454330444, + "num_tokens": 102298062.0, + "step": 3957 + }, + { + "epoch": 0.43465846694487154, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 8.585969924926758, + 
"learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6991124749183655, + "num_tokens": 102324012.0, + "step": 3958 + }, + { + "epoch": 0.4347682846474852, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.341491222381592, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6849287152290344, + "num_tokens": 102352815.0, + "step": 3959 + }, + { + "epoch": 0.4348781023500988, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.4700629711151123, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6969717741012573, + "num_tokens": 102378014.0, + "step": 3960 + }, + { + "epoch": 0.4349879200527125, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.44279408454895, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7273833751678467, + "num_tokens": 102402223.0, + "step": 3961 + }, + { + "epoch": 0.4350977377553262, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2686803340911865, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7348949909210205, + "num_tokens": 102429827.0, + "step": 3962 + }, + { + "epoch": 0.4352075554579398, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.20194411277771, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6943548321723938, + "num_tokens": 102458769.0, + "step": 3963 + }, + { + "epoch": 0.43531737316055347, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2557642459869385, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6946744918823242, + "num_tokens": 102487677.0, + "step": 3964 + }, + { + "epoch": 0.43542719086316717, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.2743074893951416, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.7040701508522034, + "num_tokens": 102516194.0, + "step": 3965 + }, + { + "epoch": 0.4355370085657808, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.6988542079925537, + "learning_rate": 1e-06, + "loss": 
0.914, + "mean_token_accuracy": 0.7187850475311279, + "num_tokens": 102534115.0, + "step": 3966 + }, + { + "epoch": 0.43564682626839446, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.4739794731140137, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7129919528961182, + "num_tokens": 102557009.0, + "step": 3967 + }, + { + "epoch": 0.4357566439710081, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.416322708129883, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6956863403320312, + "num_tokens": 102581438.0, + "step": 3968 + }, + { + "epoch": 0.4358664616736218, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.532480478286743, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7154720425605774, + "num_tokens": 102603822.0, + "step": 3969 + }, + { + "epoch": 0.43597627937623545, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.533297300338745, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7048485279083252, + "num_tokens": 102625582.0, + "step": 3970 + }, + { + "epoch": 0.4360860970788491, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.3281943798065186, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6859541535377502, + "num_tokens": 102655237.0, + "step": 3971 + }, + { + "epoch": 0.43619591478146275, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.331836700439453, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7053089737892151, + "num_tokens": 102681610.0, + "step": 3972 + }, + { + "epoch": 0.43630573248407645, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.185129404067993, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.680566668510437, + "num_tokens": 102711636.0, + "step": 3973 + }, + { + "epoch": 0.4364155501866901, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 2.4571757316589355, + "learning_rate": 1e-06, + "loss": 1.0879, + "mean_token_accuracy": 
0.6773953437805176, + "num_tokens": 102740269.0, + "step": 3974 + }, + { + "epoch": 0.43652536788930374, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.5779449939727783, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7282458543777466, + "num_tokens": 102765140.0, + "step": 3975 + }, + { + "epoch": 0.43663518559191744, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.549938917160034, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6934775710105896, + "num_tokens": 102790550.0, + "step": 3976 + }, + { + "epoch": 0.4367450032945311, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.330514430999756, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6904017329216003, + "num_tokens": 102818283.0, + "step": 3977 + }, + { + "epoch": 0.43685482099714473, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.5161330699920654, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.696094274520874, + "num_tokens": 102839697.0, + "step": 3978 + }, + { + "epoch": 0.4369646386997584, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.2255029678344727, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6909341216087341, + "num_tokens": 102868038.0, + "step": 3979 + }, + { + "epoch": 0.4370744564023721, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.7088677883148193, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6955338716506958, + "num_tokens": 102890261.0, + "step": 3980 + }, + { + "epoch": 0.4371842741049857, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.713088035583496, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6824498176574707, + "num_tokens": 102912602.0, + "step": 3981 + }, + { + "epoch": 0.43729409180759937, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.404597520828247, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7183343768119812, + 
"num_tokens": 102936652.0, + "step": 3982 + }, + { + "epoch": 0.4374039095102131, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.2112393379211426, + "learning_rate": 1e-06, + "loss": 1.0893, + "mean_token_accuracy": 0.6793274879455566, + "num_tokens": 102964564.0, + "step": 3983 + }, + { + "epoch": 0.4375137272128267, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.30122709274292, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7222655415534973, + "num_tokens": 102991026.0, + "step": 3984 + }, + { + "epoch": 0.43762354491544037, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.3602511882781982, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.692480206489563, + "num_tokens": 103017666.0, + "step": 3985 + }, + { + "epoch": 0.437733362618054, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.7075304985046387, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7179045677185059, + "num_tokens": 103038445.0, + "step": 3986 + }, + { + "epoch": 0.4378431803206677, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.3724558353424072, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6849896907806396, + "num_tokens": 103069019.0, + "step": 3987 + }, + { + "epoch": 0.43795299802328136, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.656590223312378, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7065433859825134, + "num_tokens": 103090281.0, + "step": 3988 + }, + { + "epoch": 0.438062815725895, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.196589708328247, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7293620109558105, + "num_tokens": 103118962.0, + "step": 3989 + }, + { + "epoch": 0.43817263342850865, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.339348316192627, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7157710194587708, + "num_tokens": 103145401.0, + "step": 
3990 + }, + { + "epoch": 0.43828245113112235, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.288210153579712, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6970436573028564, + "num_tokens": 103172448.0, + "step": 3991 + }, + { + "epoch": 0.438392268833736, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.307755708694458, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7060542106628418, + "num_tokens": 103198006.0, + "step": 3992 + }, + { + "epoch": 0.43850208653634964, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.4152944087982178, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7050231099128723, + "num_tokens": 103222454.0, + "step": 3993 + }, + { + "epoch": 0.43861190423896335, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.3169357776641846, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7073206901550293, + "num_tokens": 103249089.0, + "step": 3994 + }, + { + "epoch": 0.438721721941577, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.5715417861938477, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7266899347305298, + "num_tokens": 103270197.0, + "step": 3995 + }, + { + "epoch": 0.43883153964419064, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.276578426361084, + "learning_rate": 1e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.688646674156189, + "num_tokens": 103297499.0, + "step": 3996 + }, + { + "epoch": 0.4389413573468043, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.2902979850769043, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7134411334991455, + "num_tokens": 103324855.0, + "step": 3997 + }, + { + "epoch": 0.439051175049418, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.313586473464966, + "learning_rate": 1e-06, + "loss": 1.124, + "mean_token_accuracy": 0.6787512302398682, + "num_tokens": 103352855.0, + "step": 3998 + }, + { + "epoch": 
0.43916099275203163, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.454563856124878, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6948162913322449, + "num_tokens": 103375974.0, + "step": 3999 + }, + { + "epoch": 0.4392708104546453, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.181025266647339, + "learning_rate": 1e-06, + "loss": 1.1343, + "mean_token_accuracy": 0.6651694774627686, + "num_tokens": 103407738.0, + "step": 4000 + }, + { + "epoch": 0.4393806281572589, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.2316596508026123, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6958991289138794, + "num_tokens": 103437027.0, + "step": 4001 + }, + { + "epoch": 0.4394904458598726, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.521913528442383, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.703694224357605, + "num_tokens": 103460046.0, + "step": 4002 + }, + { + "epoch": 0.43960026356248627, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.3674042224884033, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.702125608921051, + "num_tokens": 103485621.0, + "step": 4003 + }, + { + "epoch": 0.4397100812650999, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.0179736614227295, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6837095618247986, + "num_tokens": 103518984.0, + "step": 4004 + }, + { + "epoch": 0.4398198989677136, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.582332134246826, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6906071901321411, + "num_tokens": 103541595.0, + "step": 4005 + }, + { + "epoch": 0.43992971667032726, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.5298361778259277, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6864032745361328, + "num_tokens": 103565995.0, + "step": 4006 + }, + { + "epoch": 0.4400395343729409, + "ewc_loss": 
1.0728836059570312e-05, + "grad_norm": 2.540189743041992, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6867580413818359, + "num_tokens": 103589961.0, + "step": 4007 + }, + { + "epoch": 0.44014935207555456, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.1518890857696533, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6856138110160828, + "num_tokens": 103624015.0, + "step": 4008 + }, + { + "epoch": 0.44025916977816826, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.300414562225342, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7148483991622925, + "num_tokens": 103648981.0, + "step": 4009 + }, + { + "epoch": 0.4403689874807819, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.441239833831787, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7133947014808655, + "num_tokens": 103672992.0, + "step": 4010 + }, + { + "epoch": 0.44047880518339555, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.6036438941955566, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7087888121604919, + "num_tokens": 103693983.0, + "step": 4011 + }, + { + "epoch": 0.44058862288600925, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.453752040863037, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6863116025924683, + "num_tokens": 103717413.0, + "step": 4012 + }, + { + "epoch": 0.4406984405886229, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.6178295612335205, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7045403718948364, + "num_tokens": 103739793.0, + "step": 4013 + }, + { + "epoch": 0.44080825829123654, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.4118359088897705, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7115273475646973, + "num_tokens": 103763425.0, + "step": 4014 + }, + { + "epoch": 0.4409180759938502, + "ewc_loss": 1.0728836059570312e-05, + 
"grad_norm": 2.557851552963257, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7255585789680481, + "num_tokens": 103784225.0, + "step": 4015 + }, + { + "epoch": 0.4410278936964639, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.2352914810180664, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7033559679985046, + "num_tokens": 103811751.0, + "step": 4016 + }, + { + "epoch": 0.44113771139907754, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.0705909729003906, + "learning_rate": 1e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.689673900604248, + "num_tokens": 103843323.0, + "step": 4017 + }, + { + "epoch": 0.4412475291016912, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.3817245960235596, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.7066853046417236, + "num_tokens": 103869122.0, + "step": 4018 + }, + { + "epoch": 0.44135734680430483, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.4180667400360107, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6892349720001221, + "num_tokens": 103892813.0, + "step": 4019 + }, + { + "epoch": 0.44146716450691853, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.0278854370117188, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.689545750617981, + "num_tokens": 103925498.0, + "step": 4020 + }, + { + "epoch": 0.4415769822095322, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 2.457270622253418, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7123479843139648, + "num_tokens": 103948200.0, + "step": 4021 + }, + { + "epoch": 0.4416867999121458, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.456510066986084, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7294989824295044, + "num_tokens": 103969306.0, + "step": 4022 + }, + { + "epoch": 0.4417966176147595, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.480464458465576, + 
"learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7092406749725342, + "num_tokens": 103992620.0, + "step": 4023 + }, + { + "epoch": 0.44190643531737317, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.325864315032959, + "learning_rate": 1e-06, + "loss": 1.0774, + "mean_token_accuracy": 0.6949790716171265, + "num_tokens": 104022069.0, + "step": 4024 + }, + { + "epoch": 0.4420162530199868, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.258570671081543, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6971991658210754, + "num_tokens": 104052384.0, + "step": 4025 + }, + { + "epoch": 0.44212607072260046, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.3118185997009277, + "learning_rate": 1e-06, + "loss": 1.0857, + "mean_token_accuracy": 0.6839823722839355, + "num_tokens": 104080332.0, + "step": 4026 + }, + { + "epoch": 0.44223588842521416, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.3799402713775635, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6908449530601501, + "num_tokens": 104106260.0, + "step": 4027 + }, + { + "epoch": 0.4423457061278278, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.090592861175537, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6965606212615967, + "num_tokens": 104135474.0, + "step": 4028 + }, + { + "epoch": 0.44245552383044146, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5047173500061035, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7007656097412109, + "num_tokens": 104157966.0, + "step": 4029 + }, + { + "epoch": 0.4425653415330551, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5535454750061035, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6910339593887329, + "num_tokens": 104180901.0, + "step": 4030 + }, + { + "epoch": 0.4426751592356688, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.240931987762451, + "learning_rate": 1e-06, + "loss": 
1.0133, + "mean_token_accuracy": 0.7008743286132812, + "num_tokens": 104210427.0, + "step": 4031 + }, + { + "epoch": 0.44278497693828245, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.235670328140259, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7091786861419678, + "num_tokens": 104240781.0, + "step": 4032 + }, + { + "epoch": 0.4428947946408961, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.3875370025634766, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6979175806045532, + "num_tokens": 104265444.0, + "step": 4033 + }, + { + "epoch": 0.4430046123435098, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.389878511428833, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7057475447654724, + "num_tokens": 104289851.0, + "step": 4034 + }, + { + "epoch": 0.44311443004612344, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.219958543777466, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7055747509002686, + "num_tokens": 104318477.0, + "step": 4035 + }, + { + "epoch": 0.4432242477487371, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.3583297729492188, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6878492832183838, + "num_tokens": 104344467.0, + "step": 4036 + }, + { + "epoch": 0.44333406545135073, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.4296278953552246, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6867074966430664, + "num_tokens": 104370130.0, + "step": 4037 + }, + { + "epoch": 0.44344388315396444, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.399461269378662, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.721034049987793, + "num_tokens": 104393844.0, + "step": 4038 + }, + { + "epoch": 0.4435537008565781, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.721938133239746, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 
0.6956155896186829, + "num_tokens": 104413570.0, + "step": 4039 + }, + { + "epoch": 0.4436635185591917, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.2732250690460205, + "learning_rate": 1e-06, + "loss": 1.1363, + "mean_token_accuracy": 0.664862871170044, + "num_tokens": 104443227.0, + "step": 4040 + }, + { + "epoch": 0.44377333626180543, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.448983907699585, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7004772424697876, + "num_tokens": 104467348.0, + "step": 4041 + }, + { + "epoch": 0.4438831539644191, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.676929473876953, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7150839567184448, + "num_tokens": 104487199.0, + "step": 4042 + }, + { + "epoch": 0.4439929716670327, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5414958000183105, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7011194825172424, + "num_tokens": 104508389.0, + "step": 4043 + }, + { + "epoch": 0.44410278936964637, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.11285400390625, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7071006298065186, + "num_tokens": 104541411.0, + "step": 4044 + }, + { + "epoch": 0.44421260707226007, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.2756989002227783, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7107239961624146, + "num_tokens": 104568153.0, + "step": 4045 + }, + { + "epoch": 0.4443224247748737, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5260558128356934, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6906143426895142, + "num_tokens": 104591936.0, + "step": 4046 + }, + { + "epoch": 0.44443224247748736, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.2757394313812256, + "learning_rate": 1e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.684414267539978, + "num_tokens": 
104621841.0, + "step": 4047 + }, + { + "epoch": 0.444542060180101, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.246485471725464, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6977477073669434, + "num_tokens": 104651647.0, + "step": 4048 + }, + { + "epoch": 0.4446518778827147, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.4781301021575928, + "learning_rate": 1e-06, + "loss": 1.0857, + "mean_token_accuracy": 0.6864809393882751, + "num_tokens": 104677572.0, + "step": 4049 + }, + { + "epoch": 0.44476169558532835, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.282715320587158, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7137941718101501, + "num_tokens": 104706092.0, + "step": 4050 + }, + { + "epoch": 0.444871513287942, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.390484571456909, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6836327314376831, + "num_tokens": 104732079.0, + "step": 4051 + }, + { + "epoch": 0.4449813309905557, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.570833683013916, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6844451427459717, + "num_tokens": 104760767.0, + "step": 4052 + }, + { + "epoch": 0.44509114869316935, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.3462605476379395, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7139356136322021, + "num_tokens": 104788430.0, + "step": 4053 + }, + { + "epoch": 0.445200966395783, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.31351637840271, + "learning_rate": 1e-06, + "loss": 1.0604, + "mean_token_accuracy": 0.6872900724411011, + "num_tokens": 104817342.0, + "step": 4054 + }, + { + "epoch": 0.44531078409839664, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.2634410858154297, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6971649527549744, + "num_tokens": 104843385.0, + "step": 4055 + }, + { 
+ "epoch": 0.44542060180101034, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5278491973876953, + "learning_rate": 1e-06, + "loss": 1.1028, + "mean_token_accuracy": 0.6754634976387024, + "num_tokens": 104868452.0, + "step": 4056 + }, + { + "epoch": 0.445530419503624, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.487236499786377, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.721792459487915, + "num_tokens": 104890186.0, + "step": 4057 + }, + { + "epoch": 0.44564023720623763, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.475461483001709, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6915291547775269, + "num_tokens": 104914247.0, + "step": 4058 + }, + { + "epoch": 0.44575005490885133, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.38643217086792, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.692590057849884, + "num_tokens": 104939335.0, + "step": 4059 + }, + { + "epoch": 0.445859872611465, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5503387451171875, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.709423303604126, + "num_tokens": 104962635.0, + "step": 4060 + }, + { + "epoch": 0.4459696903140786, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5409717559814453, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7233704328536987, + "num_tokens": 104983584.0, + "step": 4061 + }, + { + "epoch": 0.44607950801669227, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.464660882949829, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7200198173522949, + "num_tokens": 105004819.0, + "step": 4062 + }, + { + "epoch": 0.446189325719306, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.503330945968628, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7313313484191895, + "num_tokens": 105026420.0, + "step": 4063 + }, + { + "epoch": 0.4462991434219196, + 
"ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5397982597351074, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7036821246147156, + "num_tokens": 105048663.0, + "step": 4064 + }, + { + "epoch": 0.44640896112453327, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.64237642288208, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7034220695495605, + "num_tokens": 105072117.0, + "step": 4065 + }, + { + "epoch": 0.4465187788271469, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.349457263946533, + "learning_rate": 1e-06, + "loss": 1.144, + "mean_token_accuracy": 0.6700564622879028, + "num_tokens": 105100997.0, + "step": 4066 + }, + { + "epoch": 0.4466285965297606, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.3478779792785645, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6882855296134949, + "num_tokens": 105126987.0, + "step": 4067 + }, + { + "epoch": 0.44673841423237426, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.39327073097229, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7091437578201294, + "num_tokens": 105151037.0, + "step": 4068 + }, + { + "epoch": 0.4468482319349879, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.270078182220459, + "learning_rate": 1e-06, + "loss": 1.0934, + "mean_token_accuracy": 0.6795597076416016, + "num_tokens": 105179052.0, + "step": 4069 + }, + { + "epoch": 0.4469580496376016, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.4163119792938232, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.709641695022583, + "num_tokens": 105202762.0, + "step": 4070 + }, + { + "epoch": 0.44706786734021525, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.2759764194488525, + "learning_rate": 1e-06, + "loss": 1.0924, + "mean_token_accuracy": 0.6796754598617554, + "num_tokens": 105231743.0, + "step": 4071 + }, + { + "epoch": 0.4471776850428289, + "ewc_loss": 1.0848045349121094e-05, 
+ "grad_norm": 2.2639048099517822, + "learning_rate": 1e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.6752957105636597, + "num_tokens": 105258755.0, + "step": 4072 + }, + { + "epoch": 0.44728750274544254, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.518167018890381, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6903908252716064, + "num_tokens": 105283134.0, + "step": 4073 + }, + { + "epoch": 0.44739732044805625, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.8043224811553955, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7317247986793518, + "num_tokens": 105302217.0, + "step": 4074 + }, + { + "epoch": 0.4475071381506699, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.574559450149536, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6992217898368835, + "num_tokens": 105324669.0, + "step": 4075 + }, + { + "epoch": 0.44761695585328354, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.294604778289795, + "learning_rate": 1e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.686909556388855, + "num_tokens": 105351150.0, + "step": 4076 + }, + { + "epoch": 0.4477267735558972, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.031569242477417, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6991696357727051, + "num_tokens": 105382344.0, + "step": 4077 + }, + { + "epoch": 0.4478365912585109, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.0984439849853516, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7123956084251404, + "num_tokens": 105412565.0, + "step": 4078 + }, + { + "epoch": 0.44794640896112453, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 2.5980660915374756, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6967341303825378, + "num_tokens": 105434373.0, + "step": 4079 + }, + { + "epoch": 0.4480562266637382, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 
2.5279839038848877, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6919991374015808, + "num_tokens": 105457771.0, + "step": 4080 + }, + { + "epoch": 0.4481660443663519, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1656088829040527, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7169885039329529, + "num_tokens": 105483652.0, + "step": 4081 + }, + { + "epoch": 0.4482758620689655, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4689626693725586, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7184942960739136, + "num_tokens": 105506506.0, + "step": 4082 + }, + { + "epoch": 0.44838567977157917, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.183476209640503, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7139314413070679, + "num_tokens": 105535039.0, + "step": 4083 + }, + { + "epoch": 0.4484954974741928, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.2355265617370605, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6964176297187805, + "num_tokens": 105561456.0, + "step": 4084 + }, + { + "epoch": 0.4486053151768065, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3480770587921143, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6814365386962891, + "num_tokens": 105587627.0, + "step": 4085 + }, + { + "epoch": 0.44871513287942016, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4553661346435547, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.706200361251831, + "num_tokens": 105613040.0, + "step": 4086 + }, + { + "epoch": 0.4488249505820338, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.439943790435791, + "learning_rate": 1e-06, + "loss": 1.0654, + "mean_token_accuracy": 0.6994761824607849, + "num_tokens": 105638821.0, + "step": 4087 + }, + { + "epoch": 0.4489347682846475, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4757258892059326, + 
"learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6985658407211304, + "num_tokens": 105661894.0, + "step": 4088 + }, + { + "epoch": 0.44904458598726116, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.0140132904052734, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6893707513809204, + "num_tokens": 105697736.0, + "step": 4089 + }, + { + "epoch": 0.4491544036898748, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.5475339889526367, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7243189811706543, + "num_tokens": 105717630.0, + "step": 4090 + }, + { + "epoch": 0.44926422139248845, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.5820131301879883, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.705801248550415, + "num_tokens": 105739780.0, + "step": 4091 + }, + { + "epoch": 0.44937403909510215, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 7.0603532791137695, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.715326189994812, + "num_tokens": 105767513.0, + "step": 4092 + }, + { + "epoch": 0.4494838567977158, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3708205223083496, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.698267936706543, + "num_tokens": 105793045.0, + "step": 4093 + }, + { + "epoch": 0.44959367450032944, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3092105388641357, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6921496391296387, + "num_tokens": 105818510.0, + "step": 4094 + }, + { + "epoch": 0.4497034922029431, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.7389075756073, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7061720490455627, + "num_tokens": 105836643.0, + "step": 4095 + }, + { + "epoch": 0.4498133099055568, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1762170791625977, + "learning_rate": 1e-06, + "loss": 
1.0825, + "mean_token_accuracy": 0.6826117038726807, + "num_tokens": 105865818.0, + "step": 4096 + }, + { + "epoch": 0.44992312760817044, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3244946002960205, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.71058189868927, + "num_tokens": 105891557.0, + "step": 4097 + }, + { + "epoch": 0.4500329453107841, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.524768590927124, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6889941692352295, + "num_tokens": 105914959.0, + "step": 4098 + }, + { + "epoch": 0.4501427630133978, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3571910858154297, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6959750056266785, + "num_tokens": 105940788.0, + "step": 4099 + }, + { + "epoch": 0.45025258071601143, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.8419229984283447, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7256542444229126, + "num_tokens": 105960218.0, + "step": 4100 + }, + { + "epoch": 0.4503623984186251, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.409802198410034, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7069761753082275, + "num_tokens": 105985451.0, + "step": 4101 + }, + { + "epoch": 0.4504722161212387, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.151559352874756, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7082595229148865, + "num_tokens": 106015411.0, + "step": 4102 + }, + { + "epoch": 0.4505820338238524, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3769304752349854, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6932287216186523, + "num_tokens": 106040921.0, + "step": 4103 + }, + { + "epoch": 0.45069185152646607, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4054620265960693, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 
0.7047166228294373, + "num_tokens": 106065656.0, + "step": 4104 + }, + { + "epoch": 0.4508016692290797, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1476244926452637, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7068333625793457, + "num_tokens": 106097280.0, + "step": 4105 + }, + { + "epoch": 0.45091148693169336, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.211634874343872, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7279432415962219, + "num_tokens": 106123047.0, + "step": 4106 + }, + { + "epoch": 0.45102130463430706, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.206112861633301, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7114264965057373, + "num_tokens": 106151119.0, + "step": 4107 + }, + { + "epoch": 0.4511311223369207, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.098780870437622, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.681591272354126, + "num_tokens": 106183173.0, + "step": 4108 + }, + { + "epoch": 0.45124094003953436, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4760489463806152, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7066556215286255, + "num_tokens": 106206388.0, + "step": 4109 + }, + { + "epoch": 0.45135075774214806, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.2510437965393066, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.7078491449356079, + "num_tokens": 106235476.0, + "step": 4110 + }, + { + "epoch": 0.4514605754447617, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 7.104694843292236, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7070975303649902, + "num_tokens": 106261962.0, + "step": 4111 + }, + { + "epoch": 0.45157039314737535, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.462491273880005, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6813173294067383, + 
"num_tokens": 106288609.0, + "step": 4112 + }, + { + "epoch": 0.451680210849989, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.190455675125122, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7116292715072632, + "num_tokens": 106316201.0, + "step": 4113 + }, + { + "epoch": 0.4517900285526027, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.0406908988952637, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.68576979637146, + "num_tokens": 106349835.0, + "step": 4114 + }, + { + "epoch": 0.45189984625521634, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.2637991905212402, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6885466575622559, + "num_tokens": 106376203.0, + "step": 4115 + }, + { + "epoch": 0.45200966395783, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.308619737625122, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6999587416648865, + "num_tokens": 106404493.0, + "step": 4116 + }, + { + "epoch": 0.4521194816604437, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1673853397369385, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6994408369064331, + "num_tokens": 106432988.0, + "step": 4117 + }, + { + "epoch": 0.45222929936305734, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.6908442974090576, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6933314800262451, + "num_tokens": 106453120.0, + "step": 4118 + }, + { + "epoch": 0.452339117065671, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4209482669830322, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7134901285171509, + "num_tokens": 106475765.0, + "step": 4119 + }, + { + "epoch": 0.4524489347682846, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3457283973693848, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7091771364212036, + "num_tokens": 106501456.0, + "step": 
4120 + }, + { + "epoch": 0.45255875247089833, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1499104499816895, + "learning_rate": 1e-06, + "loss": 1.1219, + "mean_token_accuracy": 0.6669524312019348, + "num_tokens": 106534135.0, + "step": 4121 + }, + { + "epoch": 0.452668570173512, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4103596210479736, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7145419120788574, + "num_tokens": 106558166.0, + "step": 4122 + }, + { + "epoch": 0.4527783878761256, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.245389938354492, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.686526894569397, + "num_tokens": 106585004.0, + "step": 4123 + }, + { + "epoch": 0.45288820557873927, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.354797601699829, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7262297868728638, + "num_tokens": 106609383.0, + "step": 4124 + }, + { + "epoch": 0.45299802328135297, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1972029209136963, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.695549726486206, + "num_tokens": 106641061.0, + "step": 4125 + }, + { + "epoch": 0.4531078409839666, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.203598976135254, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7082923650741577, + "num_tokens": 106672179.0, + "step": 4126 + }, + { + "epoch": 0.45321765868658026, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.524230718612671, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6923074722290039, + "num_tokens": 106695512.0, + "step": 4127 + }, + { + "epoch": 0.45332747638919396, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.0700223445892334, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.726362407207489, + "num_tokens": 106723126.0, + "step": 4128 + }, + { + "epoch": 
0.4534372940918076, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4111382961273193, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7045379877090454, + "num_tokens": 106747737.0, + "step": 4129 + }, + { + "epoch": 0.45354711179442125, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1616153717041016, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6905456781387329, + "num_tokens": 106778830.0, + "step": 4130 + }, + { + "epoch": 0.4536569294970349, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.380559206008911, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6897486448287964, + "num_tokens": 106803825.0, + "step": 4131 + }, + { + "epoch": 0.4537667471996486, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.120063066482544, + "learning_rate": 1e-06, + "loss": 1.1254, + "mean_token_accuracy": 0.6752382516860962, + "num_tokens": 106836896.0, + "step": 4132 + }, + { + "epoch": 0.45387656490226225, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.323758363723755, + "learning_rate": 1e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.6746299862861633, + "num_tokens": 106862131.0, + "step": 4133 + }, + { + "epoch": 0.4539863826048759, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.299072027206421, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.7011059522628784, + "num_tokens": 106888132.0, + "step": 4134 + }, + { + "epoch": 0.4540962003074896, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.226402997970581, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7108559012413025, + "num_tokens": 106916637.0, + "step": 4135 + }, + { + "epoch": 0.45420601801010324, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.29845929145813, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6973132491111755, + "num_tokens": 106943004.0, + "step": 4136 + }, + { + "epoch": 0.4543158357127169, + "ewc_loss": 
1.0848045349121094e-05, + "grad_norm": 2.258600950241089, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6943989992141724, + "num_tokens": 106972634.0, + "step": 4137 + }, + { + "epoch": 0.45442565341533053, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.518949508666992, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6864039301872253, + "num_tokens": 106995022.0, + "step": 4138 + }, + { + "epoch": 0.45453547111794423, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.65207839012146, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7180918455123901, + "num_tokens": 107014998.0, + "step": 4139 + }, + { + "epoch": 0.4546452888205579, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.0797207355499268, + "learning_rate": 1e-06, + "loss": 1.0954, + "mean_token_accuracy": 0.675524115562439, + "num_tokens": 107047636.0, + "step": 4140 + }, + { + "epoch": 0.4547551065231715, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.5726711750030518, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7049980163574219, + "num_tokens": 107073075.0, + "step": 4141 + }, + { + "epoch": 0.45486492422578517, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.6813318729400635, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.707923173904419, + "num_tokens": 107095074.0, + "step": 4142 + }, + { + "epoch": 0.4549747419283989, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3646087646484375, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6958438158035278, + "num_tokens": 107120478.0, + "step": 4143 + }, + { + "epoch": 0.4550845596310125, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.2334773540496826, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.7002041339874268, + "num_tokens": 107149935.0, + "step": 4144 + }, + { + "epoch": 0.45519437733362617, + "ewc_loss": 1.0848045349121094e-05, + 
"grad_norm": 2.390321969985962, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7001486420631409, + "num_tokens": 107176438.0, + "step": 4145 + }, + { + "epoch": 0.45530419503623987, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.202151298522949, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7042253613471985, + "num_tokens": 107202811.0, + "step": 4146 + }, + { + "epoch": 0.4554140127388535, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.2367076873779297, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6906589865684509, + "num_tokens": 107233423.0, + "step": 4147 + }, + { + "epoch": 0.45552383044146716, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.2851219177246094, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6996191740036011, + "num_tokens": 107259979.0, + "step": 4148 + }, + { + "epoch": 0.4556336481440808, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.20212721824646, + "learning_rate": 1e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.6845742464065552, + "num_tokens": 107289683.0, + "step": 4149 + }, + { + "epoch": 0.4557434658466945, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.4042539596557617, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7035808563232422, + "num_tokens": 107315025.0, + "step": 4150 + }, + { + "epoch": 0.45585328354930815, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.567348003387451, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6997014880180359, + "num_tokens": 107337420.0, + "step": 4151 + }, + { + "epoch": 0.4559631012519218, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.529745101928711, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6964385509490967, + "num_tokens": 107360034.0, + "step": 4152 + }, + { + "epoch": 0.45607291895453544, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.2972617149353027, + 
"learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6844751238822937, + "num_tokens": 107389164.0, + "step": 4153 + }, + { + "epoch": 0.45618273665714915, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.455930233001709, + "learning_rate": 1e-06, + "loss": 1.1394, + "mean_token_accuracy": 0.6722207069396973, + "num_tokens": 107417239.0, + "step": 4154 + }, + { + "epoch": 0.4562925543597628, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.446291923522949, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6927312612533569, + "num_tokens": 107442241.0, + "step": 4155 + }, + { + "epoch": 0.45640237206237644, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.415207862854004, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7017310857772827, + "num_tokens": 107464525.0, + "step": 4156 + }, + { + "epoch": 0.45651218976499014, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.5374879837036133, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7163554430007935, + "num_tokens": 107487863.0, + "step": 4157 + }, + { + "epoch": 0.4566220074676038, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.270637273788452, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6950045228004456, + "num_tokens": 107516463.0, + "step": 4158 + }, + { + "epoch": 0.45673182517021743, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.154686689376831, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6813591122627258, + "num_tokens": 107547372.0, + "step": 4159 + }, + { + "epoch": 0.4568416428728311, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.500323534011841, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7047140598297119, + "num_tokens": 107570039.0, + "step": 4160 + }, + { + "epoch": 0.4569514605754448, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.451202392578125, + "learning_rate": 1e-06, + "loss": 
1.0652, + "mean_token_accuracy": 0.6904617547988892, + "num_tokens": 107595999.0, + "step": 4161 + }, + { + "epoch": 0.4570612782780584, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.2520852088928223, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.696281373500824, + "num_tokens": 107624303.0, + "step": 4162 + }, + { + "epoch": 0.45717109598067207, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.2791261672973633, + "learning_rate": 1e-06, + "loss": 1.0949, + "mean_token_accuracy": 0.6849744319915771, + "num_tokens": 107654015.0, + "step": 4163 + }, + { + "epoch": 0.4572809136832858, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.2015011310577393, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7309067845344543, + "num_tokens": 107681066.0, + "step": 4164 + }, + { + "epoch": 0.4573907313858994, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4538819789886475, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7147983312606812, + "num_tokens": 107704738.0, + "step": 4165 + }, + { + "epoch": 0.45750054908851306, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1430797576904297, + "learning_rate": 1e-06, + "loss": 1.1002, + "mean_token_accuracy": 0.6754758358001709, + "num_tokens": 107738308.0, + "step": 4166 + }, + { + "epoch": 0.4576103667911267, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.1863460540771484, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.6828229427337646, + "num_tokens": 107770284.0, + "step": 4167 + }, + { + "epoch": 0.4577201844937404, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.266366481781006, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6871955394744873, + "num_tokens": 107800283.0, + "step": 4168 + }, + { + "epoch": 0.45783000219635406, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.418534517288208, + "learning_rate": 1e-06, + "loss": 0.9359, + 
"mean_token_accuracy": 0.7175233364105225, + "num_tokens": 107824988.0, + "step": 4169 + }, + { + "epoch": 0.4579398198989677, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.4682090282440186, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7209197878837585, + "num_tokens": 107847507.0, + "step": 4170 + }, + { + "epoch": 0.45804963760158135, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.197354793548584, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6919865608215332, + "num_tokens": 107878874.0, + "step": 4171 + }, + { + "epoch": 0.45815945530419505, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.221543312072754, + "learning_rate": 1e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.6896942853927612, + "num_tokens": 107907987.0, + "step": 4172 + }, + { + "epoch": 0.4582692730068087, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.143765449523926, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6917246580123901, + "num_tokens": 107935921.0, + "step": 4173 + }, + { + "epoch": 0.45837909070942234, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.069563627243042, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7001816630363464, + "num_tokens": 107969167.0, + "step": 4174 + }, + { + "epoch": 0.45848890841203604, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 2.3632652759552, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7104636430740356, + "num_tokens": 107993529.0, + "step": 4175 + }, + { + "epoch": 0.4585987261146497, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.4186768531799316, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6826379299163818, + "num_tokens": 108019805.0, + "step": 4176 + }, + { + "epoch": 0.45870854381726334, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1410481929779053, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 
0.6890462040901184, + "num_tokens": 108048592.0, + "step": 4177 + }, + { + "epoch": 0.458818361519877, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.109056234359741, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.713973879814148, + "num_tokens": 108079410.0, + "step": 4178 + }, + { + "epoch": 0.4589281792224907, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.330226182937622, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7101107835769653, + "num_tokens": 108106842.0, + "step": 4179 + }, + { + "epoch": 0.45903799692510433, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.30208420753479, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6979562044143677, + "num_tokens": 108134819.0, + "step": 4180 + }, + { + "epoch": 0.459147814627718, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.4059066772460938, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7098268270492554, + "num_tokens": 108160211.0, + "step": 4181 + }, + { + "epoch": 0.4592576323303316, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.410741090774536, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7043453454971313, + "num_tokens": 108184231.0, + "step": 4182 + }, + { + "epoch": 0.4593674500329453, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.436060667037964, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6982735395431519, + "num_tokens": 108206916.0, + "step": 4183 + }, + { + "epoch": 0.45947726773555897, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.4862020015716553, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7121570110321045, + "num_tokens": 108227851.0, + "step": 4184 + }, + { + "epoch": 0.4595870854381726, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.397477865219116, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6932562589645386, + "num_tokens": 
108254622.0, + "step": 4185 + }, + { + "epoch": 0.4596969031407863, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.246901273727417, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6973634362220764, + "num_tokens": 108282366.0, + "step": 4186 + }, + { + "epoch": 0.45980672084339996, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1641218662261963, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6876129508018494, + "num_tokens": 108314359.0, + "step": 4187 + }, + { + "epoch": 0.4599165385460136, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1357758045196533, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6987518668174744, + "num_tokens": 108342960.0, + "step": 4188 + }, + { + "epoch": 0.46002635624862726, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.2163877487182617, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6881228685379028, + "num_tokens": 108369224.0, + "step": 4189 + }, + { + "epoch": 0.46013617395124096, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.2585031986236572, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6907115578651428, + "num_tokens": 108397077.0, + "step": 4190 + }, + { + "epoch": 0.4602459916538546, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.32607102394104, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6935811638832092, + "num_tokens": 108421690.0, + "step": 4191 + }, + { + "epoch": 0.46035580935646825, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.294476270675659, + "learning_rate": 1e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6762712001800537, + "num_tokens": 108449768.0, + "step": 4192 + }, + { + "epoch": 0.46046562705908195, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 3.0601346492767334, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7052988409996033, + "num_tokens": 108466772.0, + "step": 4193 + 
}, + { + "epoch": 0.4605754447616956, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.5089194774627686, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7203969359397888, + "num_tokens": 108488184.0, + "step": 4194 + }, + { + "epoch": 0.46068526246430924, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.2741706371307373, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6981584429740906, + "num_tokens": 108514984.0, + "step": 4195 + }, + { + "epoch": 0.4607950801669229, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.2634661197662354, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7098795175552368, + "num_tokens": 108540998.0, + "step": 4196 + }, + { + "epoch": 0.4609048978695366, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1855270862579346, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7051751613616943, + "num_tokens": 108570211.0, + "step": 4197 + }, + { + "epoch": 0.46101471557215024, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.216676950454712, + "learning_rate": 1e-06, + "loss": 1.0953, + "mean_token_accuracy": 0.6763175129890442, + "num_tokens": 108599725.0, + "step": 4198 + }, + { + "epoch": 0.4611245332747639, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.439403772354126, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6901040077209473, + "num_tokens": 108624771.0, + "step": 4199 + }, + { + "epoch": 0.4612343509773775, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.308443069458008, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6888602375984192, + "num_tokens": 108655965.0, + "step": 4200 + }, + { + "epoch": 0.46134416867999123, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.4385313987731934, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.6994145512580872, + "num_tokens": 108678648.0, + "step": 4201 + }, + { + "epoch": 
0.4614539863826049, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.249976873397827, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6908202171325684, + "num_tokens": 108708088.0, + "step": 4202 + }, + { + "epoch": 0.4615638040852185, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1693437099456787, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7101908922195435, + "num_tokens": 108737497.0, + "step": 4203 + }, + { + "epoch": 0.4616736217878322, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.3219265937805176, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7070091962814331, + "num_tokens": 108764609.0, + "step": 4204 + }, + { + "epoch": 0.46178343949044587, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1909067630767822, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7010621428489685, + "num_tokens": 108790774.0, + "step": 4205 + }, + { + "epoch": 0.4618932571930595, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1197876930236816, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7161038517951965, + "num_tokens": 108820188.0, + "step": 4206 + }, + { + "epoch": 0.46200307489567316, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.5749759674072266, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7217806577682495, + "num_tokens": 108841518.0, + "step": 4207 + }, + { + "epoch": 0.46211289259828686, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.5416877269744873, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6921776533126831, + "num_tokens": 108865351.0, + "step": 4208 + }, + { + "epoch": 0.4622227103009005, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.599379539489746, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6879144906997681, + "num_tokens": 108887902.0, + "step": 4209 + }, + { + "epoch": 0.46233252800351415, + 
"ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1314821243286133, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7046797871589661, + "num_tokens": 108917553.0, + "step": 4210 + }, + { + "epoch": 0.46244234570612786, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.157811164855957, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6969770193099976, + "num_tokens": 108949999.0, + "step": 4211 + }, + { + "epoch": 0.4625521634087415, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.3190557956695557, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.7037972211837769, + "num_tokens": 108976054.0, + "step": 4212 + }, + { + "epoch": 0.46266198111135515, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.41426157951355, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7138071060180664, + "num_tokens": 109001853.0, + "step": 4213 + }, + { + "epoch": 0.4627717988139688, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.8675971031188965, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7319390177726746, + "num_tokens": 109018515.0, + "step": 4214 + }, + { + "epoch": 0.4628816165165825, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.440638303756714, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7058786749839783, + "num_tokens": 109043524.0, + "step": 4215 + }, + { + "epoch": 0.46299143421919614, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.5359532833099365, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.693226158618927, + "num_tokens": 109065870.0, + "step": 4216 + }, + { + "epoch": 0.4631012519218098, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.2271270751953125, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6883319616317749, + "num_tokens": 109092819.0, + "step": 4217 + }, + { + "epoch": 0.46321106962442343, + "ewc_loss": 
1.0907649993896484e-05, + "grad_norm": 2.3585219383239746, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7158973217010498, + "num_tokens": 109119366.0, + "step": 4218 + }, + { + "epoch": 0.46332088732703713, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.6775693893432617, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.6953597068786621, + "num_tokens": 109142027.0, + "step": 4219 + }, + { + "epoch": 0.4634307050296508, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.5305354595184326, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7013746500015259, + "num_tokens": 109164566.0, + "step": 4220 + }, + { + "epoch": 0.4635405227322644, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.6202993392944336, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7226102948188782, + "num_tokens": 109185471.0, + "step": 4221 + }, + { + "epoch": 0.4636503404348781, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.47176456451416, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7004565596580505, + "num_tokens": 109209593.0, + "step": 4222 + }, + { + "epoch": 0.4637601581374918, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.4908294677734375, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6844384074211121, + "num_tokens": 109233085.0, + "step": 4223 + }, + { + "epoch": 0.4638699758401054, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.516456127166748, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7085773348808289, + "num_tokens": 109255439.0, + "step": 4224 + }, + { + "epoch": 0.46397979354271907, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.376154899597168, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6900864839553833, + "num_tokens": 109285240.0, + "step": 4225 + }, + { + "epoch": 0.46408961124533277, + "ewc_loss": 1.0907649993896484e-05, + 
"grad_norm": 2.6623330116271973, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7168674468994141, + "num_tokens": 109306534.0, + "step": 4226 + }, + { + "epoch": 0.4641994289479464, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.2833595275878906, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6900684237480164, + "num_tokens": 109335144.0, + "step": 4227 + }, + { + "epoch": 0.46430924665056006, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1827032566070557, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6914910078048706, + "num_tokens": 109363223.0, + "step": 4228 + }, + { + "epoch": 0.4644190643531737, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.4637796878814697, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7073698043823242, + "num_tokens": 109385976.0, + "step": 4229 + }, + { + "epoch": 0.4645288820557874, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.424931287765503, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6915914416313171, + "num_tokens": 109413991.0, + "step": 4230 + }, + { + "epoch": 0.46463869975840105, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.469822883605957, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7192056775093079, + "num_tokens": 109434389.0, + "step": 4231 + }, + { + "epoch": 0.4647485174610147, + "ewc_loss": 1.0907649993896484e-05, + "grad_norm": 2.1517767906188965, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.689070463180542, + "num_tokens": 109466391.0, + "step": 4232 + }, + { + "epoch": 0.4648583351636284, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.577174425125122, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7076741456985474, + "num_tokens": 109489623.0, + "step": 4233 + }, + { + "epoch": 0.46496815286624205, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.346087694168091, + 
"learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6890665888786316, + "num_tokens": 109515780.0, + "step": 4234 + }, + { + "epoch": 0.4650779705688557, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4129581451416016, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7050291895866394, + "num_tokens": 109539212.0, + "step": 4235 + }, + { + "epoch": 0.46518778827146934, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.6342809200286865, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6868414878845215, + "num_tokens": 109560287.0, + "step": 4236 + }, + { + "epoch": 0.46529760597408304, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2585089206695557, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7039762735366821, + "num_tokens": 109588456.0, + "step": 4237 + }, + { + "epoch": 0.4654074236766967, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2790896892547607, + "learning_rate": 1e-06, + "loss": 1.0997, + "mean_token_accuracy": 0.6805387735366821, + "num_tokens": 109616581.0, + "step": 4238 + }, + { + "epoch": 0.46551724137931033, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4514176845550537, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7165395617485046, + "num_tokens": 109641331.0, + "step": 4239 + }, + { + "epoch": 0.46562705908192403, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.1954808235168457, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6885851621627808, + "num_tokens": 109670886.0, + "step": 4240 + }, + { + "epoch": 0.4657368767845377, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2981464862823486, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6914945244789124, + "num_tokens": 109697801.0, + "step": 4241 + }, + { + "epoch": 0.4658466944871513, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2522377967834473, + "learning_rate": 1e-06, + 
"loss": 0.9866, + "mean_token_accuracy": 0.7024366855621338, + "num_tokens": 109724159.0, + "step": 4242 + }, + { + "epoch": 0.46595651218976497, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.460501194000244, + "learning_rate": 1e-06, + "loss": 1.0963, + "mean_token_accuracy": 0.6701067686080933, + "num_tokens": 109749383.0, + "step": 4243 + }, + { + "epoch": 0.4660663298923787, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.340224027633667, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.70762038230896, + "num_tokens": 109776098.0, + "step": 4244 + }, + { + "epoch": 0.4661761475949923, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4865124225616455, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7092028856277466, + "num_tokens": 109797559.0, + "step": 4245 + }, + { + "epoch": 0.46628596529760596, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.1024625301361084, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7279402613639832, + "num_tokens": 109828678.0, + "step": 4246 + }, + { + "epoch": 0.4663957830002196, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2464993000030518, + "learning_rate": 1e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6812000870704651, + "num_tokens": 109857652.0, + "step": 4247 + }, + { + "epoch": 0.4665056007028333, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.235135793685913, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6970059275627136, + "num_tokens": 109884554.0, + "step": 4248 + }, + { + "epoch": 0.46661541840544696, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2617435455322266, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7158483266830444, + "num_tokens": 109911682.0, + "step": 4249 + }, + { + "epoch": 0.4667252361080606, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.485651969909668, + "learning_rate": 1e-06, + "loss": 1.0987, + 
"mean_token_accuracy": 0.6843247413635254, + "num_tokens": 109936157.0, + "step": 4250 + }, + { + "epoch": 0.4668350538106743, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.1915340423583984, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6942864656448364, + "num_tokens": 109963461.0, + "step": 4251 + }, + { + "epoch": 0.46694487151328795, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.474407434463501, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7252222299575806, + "num_tokens": 109985239.0, + "step": 4252 + }, + { + "epoch": 0.4670546892159016, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3647379875183105, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6880106329917908, + "num_tokens": 110009774.0, + "step": 4253 + }, + { + "epoch": 0.46716450691851524, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.6747353076934814, + "learning_rate": 1e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.6989932060241699, + "num_tokens": 110030532.0, + "step": 4254 + }, + { + "epoch": 0.46727432462112894, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.099911689758301, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6857600212097168, + "num_tokens": 110064065.0, + "step": 4255 + }, + { + "epoch": 0.4673841423237426, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.331817150115967, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7012502551078796, + "num_tokens": 110089461.0, + "step": 4256 + }, + { + "epoch": 0.46749396002635624, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 1.991091251373291, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6954381465911865, + "num_tokens": 110124072.0, + "step": 4257 + }, + { + "epoch": 0.4676037777289699, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.659116744995117, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 
0.7030814290046692, + "num_tokens": 110145663.0, + "step": 4258 + }, + { + "epoch": 0.4677135954315836, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.212155342102051, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6856805086135864, + "num_tokens": 110175157.0, + "step": 4259 + }, + { + "epoch": 0.46782341313419723, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.5007498264312744, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7057482004165649, + "num_tokens": 110198393.0, + "step": 4260 + }, + { + "epoch": 0.4679332308368109, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.036557912826538, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6933905482292175, + "num_tokens": 110231770.0, + "step": 4261 + }, + { + "epoch": 0.4680430485394246, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.226999044418335, + "learning_rate": 1e-06, + "loss": 1.0973, + "mean_token_accuracy": 0.6821491718292236, + "num_tokens": 110261755.0, + "step": 4262 + }, + { + "epoch": 0.4681528662420382, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.6657159328460693, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.68486487865448, + "num_tokens": 110281687.0, + "step": 4263 + }, + { + "epoch": 0.46826268394465187, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4800946712493896, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7127086520195007, + "num_tokens": 110305069.0, + "step": 4264 + }, + { + "epoch": 0.4683725016472655, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.5892961025238037, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7356343269348145, + "num_tokens": 110324935.0, + "step": 4265 + }, + { + "epoch": 0.4684823193498792, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.6430306434631348, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6925787925720215, + 
"num_tokens": 110346955.0, + "step": 4266 + }, + { + "epoch": 0.46859213705249286, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.1978859901428223, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7127393484115601, + "num_tokens": 110373544.0, + "step": 4267 + }, + { + "epoch": 0.4687019547551065, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2836928367614746, + "learning_rate": 1e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.682889997959137, + "num_tokens": 110405117.0, + "step": 4268 + }, + { + "epoch": 0.4688117724577202, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4549102783203125, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6948279142379761, + "num_tokens": 110426767.0, + "step": 4269 + }, + { + "epoch": 0.46892159016033386, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.417722702026367, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6879884600639343, + "num_tokens": 110450394.0, + "step": 4270 + }, + { + "epoch": 0.4690314078629475, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.274963140487671, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6836342811584473, + "num_tokens": 110479240.0, + "step": 4271 + }, + { + "epoch": 0.46914122556556115, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3615763187408447, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.707947850227356, + "num_tokens": 110503201.0, + "step": 4272 + }, + { + "epoch": 0.46925104326817485, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2218434810638428, + "learning_rate": 1e-06, + "loss": 1.075, + "mean_token_accuracy": 0.686326265335083, + "num_tokens": 110533873.0, + "step": 4273 + }, + { + "epoch": 0.4693608609707885, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.8199193477630615, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.71095210313797, + "num_tokens": 110553795.0, + 
"step": 4274 + }, + { + "epoch": 0.46947067867340214, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.376103162765503, + "learning_rate": 1e-06, + "loss": 1.0839, + "mean_token_accuracy": 0.6838324069976807, + "num_tokens": 110582884.0, + "step": 4275 + }, + { + "epoch": 0.4695804963760158, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3402552604675293, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7048261761665344, + "num_tokens": 110608619.0, + "step": 4276 + }, + { + "epoch": 0.4696903140786295, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4161579608917236, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7065206170082092, + "num_tokens": 110633119.0, + "step": 4277 + }, + { + "epoch": 0.46980013178124314, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.344616174697876, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6847378015518188, + "num_tokens": 110658497.0, + "step": 4278 + }, + { + "epoch": 0.4699099494838568, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4037508964538574, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7103899717330933, + "num_tokens": 110682326.0, + "step": 4279 + }, + { + "epoch": 0.4700197671864705, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2823989391326904, + "learning_rate": 1e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6874847412109375, + "num_tokens": 110712977.0, + "step": 4280 + }, + { + "epoch": 0.47012958488908413, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.5059053897857666, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6931091547012329, + "num_tokens": 110735079.0, + "step": 4281 + }, + { + "epoch": 0.4702394025916978, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2312610149383545, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6888772249221802, + "num_tokens": 110763270.0, + "step": 4282 + }, + { + 
"epoch": 0.4703492202943114, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3591036796569824, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6912891864776611, + "num_tokens": 110789102.0, + "step": 4283 + }, + { + "epoch": 0.4704590379969251, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.0756149291992188, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7169017791748047, + "num_tokens": 110820768.0, + "step": 4284 + }, + { + "epoch": 0.47056885569953877, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.1698110103607178, + "learning_rate": 1e-06, + "loss": 1.1168, + "mean_token_accuracy": 0.6719013452529907, + "num_tokens": 110852042.0, + "step": 4285 + }, + { + "epoch": 0.4706786734021524, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3545289039611816, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7147935032844543, + "num_tokens": 110877114.0, + "step": 4286 + }, + { + "epoch": 0.4707884911047661, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.130852222442627, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7393544912338257, + "num_tokens": 110905762.0, + "step": 4287 + }, + { + "epoch": 0.47089830880737976, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.5124595165252686, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.694920003414154, + "num_tokens": 110926649.0, + "step": 4288 + }, + { + "epoch": 0.4710081265099934, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.349172353744507, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6904869079589844, + "num_tokens": 110952555.0, + "step": 4289 + }, + { + "epoch": 0.47111794421260705, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3699209690093994, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6878234148025513, + "num_tokens": 110978068.0, + "step": 4290 + }, + { + "epoch": 0.47122776191522076, + 
"ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.5954294204711914, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7108888626098633, + "num_tokens": 111000492.0, + "step": 4291 + }, + { + "epoch": 0.4713375796178344, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.1738126277923584, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6861000657081604, + "num_tokens": 111031966.0, + "step": 4292 + }, + { + "epoch": 0.47144739732044805, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3237056732177734, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6824424266815186, + "num_tokens": 111060055.0, + "step": 4293 + }, + { + "epoch": 0.4715572150230617, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4075326919555664, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6888859272003174, + "num_tokens": 111088646.0, + "step": 4294 + }, + { + "epoch": 0.4716670327256754, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.379171133041382, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7037976980209351, + "num_tokens": 111113189.0, + "step": 4295 + }, + { + "epoch": 0.47177685042828904, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3660833835601807, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.703237771987915, + "num_tokens": 111136436.0, + "step": 4296 + }, + { + "epoch": 0.4718866681309027, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3179988861083984, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7096628546714783, + "num_tokens": 111162183.0, + "step": 4297 + }, + { + "epoch": 0.4719964858335164, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2038350105285645, + "learning_rate": 1e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.6851528286933899, + "num_tokens": 111191417.0, + "step": 4298 + }, + { + "epoch": 0.47210630353613003, + "ewc_loss": 
1.0967254638671875e-05, + "grad_norm": 2.253591537475586, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7022945880889893, + "num_tokens": 111219309.0, + "step": 4299 + }, + { + "epoch": 0.4722161212387437, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.289842128753662, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7147969603538513, + "num_tokens": 111247213.0, + "step": 4300 + }, + { + "epoch": 0.4723259389413573, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.6381635665893555, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6855443716049194, + "num_tokens": 111270004.0, + "step": 4301 + }, + { + "epoch": 0.472435756643971, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.41441011428833, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7091552019119263, + "num_tokens": 111294602.0, + "step": 4302 + }, + { + "epoch": 0.4725455743465847, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3975987434387207, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6872203350067139, + "num_tokens": 111321839.0, + "step": 4303 + }, + { + "epoch": 0.4726553920491983, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3802568912506104, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6865301132202148, + "num_tokens": 111348923.0, + "step": 4304 + }, + { + "epoch": 0.47276520975181197, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.076005220413208, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6898099184036255, + "num_tokens": 111378979.0, + "step": 4305 + }, + { + "epoch": 0.47287502745442567, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.5305335521698, + "learning_rate": 1e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.6962889432907104, + "num_tokens": 111402072.0, + "step": 4306 + }, + { + "epoch": 0.4729848451570393, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 
2.4648685455322266, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7068978548049927, + "num_tokens": 111424366.0, + "step": 4307 + }, + { + "epoch": 0.47309466285965296, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.081993579864502, + "learning_rate": 1e-06, + "loss": 1.1013, + "mean_token_accuracy": 0.6785515546798706, + "num_tokens": 111457727.0, + "step": 4308 + }, + { + "epoch": 0.47320448056226666, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.461717367172241, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7131333947181702, + "num_tokens": 111479473.0, + "step": 4309 + }, + { + "epoch": 0.4733142982648803, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3299975395202637, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6842225790023804, + "num_tokens": 111505459.0, + "step": 4310 + }, + { + "epoch": 0.47342411596749395, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.312511444091797, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6928468942642212, + "num_tokens": 111529820.0, + "step": 4311 + }, + { + "epoch": 0.4735339336701076, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4173150062561035, + "learning_rate": 1e-06, + "loss": 1.0726, + "mean_token_accuracy": 0.6960488557815552, + "num_tokens": 111556109.0, + "step": 4312 + }, + { + "epoch": 0.4736437513727213, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.9838948249816895, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7070807218551636, + "num_tokens": 111580721.0, + "step": 4313 + }, + { + "epoch": 0.47375356907533495, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4493887424468994, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7130119204521179, + "num_tokens": 111604459.0, + "step": 4314 + }, + { + "epoch": 0.4738633867779486, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.561824083328247, + 
"learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6993066072463989, + "num_tokens": 111629930.0, + "step": 4315 + }, + { + "epoch": 0.4739732044805623, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2090036869049072, + "learning_rate": 1e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6818881630897522, + "num_tokens": 111657613.0, + "step": 4316 + }, + { + "epoch": 0.47408302218317594, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.6859817504882812, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7253420948982239, + "num_tokens": 111677344.0, + "step": 4317 + }, + { + "epoch": 0.4741928398857896, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.5041801929473877, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7128505706787109, + "num_tokens": 111701017.0, + "step": 4318 + }, + { + "epoch": 0.47430265758840323, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.602695941925049, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7175522446632385, + "num_tokens": 111721446.0, + "step": 4319 + }, + { + "epoch": 0.47441247529101693, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.277529716491699, + "learning_rate": 1e-06, + "loss": 1.0857, + "mean_token_accuracy": 0.6831953525543213, + "num_tokens": 111749768.0, + "step": 4320 + }, + { + "epoch": 0.4745222929936306, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2478861808776855, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6968926191329956, + "num_tokens": 111780595.0, + "step": 4321 + }, + { + "epoch": 0.4746321106962442, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.173632860183716, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6951415538787842, + "num_tokens": 111808453.0, + "step": 4322 + }, + { + "epoch": 0.47474192839885787, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.26873779296875, + "learning_rate": 1e-06, + 
"loss": 1.0991, + "mean_token_accuracy": 0.677162230014801, + "num_tokens": 111838333.0, + "step": 4323 + }, + { + "epoch": 0.4748517461014716, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4072952270507812, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7012815475463867, + "num_tokens": 111862836.0, + "step": 4324 + }, + { + "epoch": 0.4749615638040852, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3780677318573, + "learning_rate": 1e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.6717772483825684, + "num_tokens": 111889887.0, + "step": 4325 + }, + { + "epoch": 0.47507138150669886, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.438281297683716, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6879991292953491, + "num_tokens": 111916363.0, + "step": 4326 + }, + { + "epoch": 0.47518119920931257, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.125731945037842, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7116733193397522, + "num_tokens": 111947248.0, + "step": 4327 + }, + { + "epoch": 0.4752910169119262, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.091102123260498, + "learning_rate": 1e-06, + "loss": 1.0981, + "mean_token_accuracy": 0.6854274868965149, + "num_tokens": 111979006.0, + "step": 4328 + }, + { + "epoch": 0.47540083461453986, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4308478832244873, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7193107008934021, + "num_tokens": 112001590.0, + "step": 4329 + }, + { + "epoch": 0.4755106523171535, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 7.195903778076172, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7078806161880493, + "num_tokens": 112022137.0, + "step": 4330 + }, + { + "epoch": 0.4756204700197672, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.412942409515381, + "learning_rate": 1e-06, + "loss": 0.8961, + 
"mean_token_accuracy": 0.7258056998252869, + "num_tokens": 112047019.0, + "step": 4331 + }, + { + "epoch": 0.47573028772238085, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2618348598480225, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6907559633255005, + "num_tokens": 112073720.0, + "step": 4332 + }, + { + "epoch": 0.4758401054249945, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.3535985946655273, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7009519934654236, + "num_tokens": 112100905.0, + "step": 4333 + }, + { + "epoch": 0.47594992312760814, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.180903911590576, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7027700543403625, + "num_tokens": 112127185.0, + "step": 4334 + }, + { + "epoch": 0.47605974083022184, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.5426950454711914, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7147488594055176, + "num_tokens": 112148344.0, + "step": 4335 + }, + { + "epoch": 0.4761695585328355, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4450910091400146, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6950826644897461, + "num_tokens": 112172753.0, + "step": 4336 + }, + { + "epoch": 0.47627937623544914, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.4426887035369873, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.7052044868469238, + "num_tokens": 112195835.0, + "step": 4337 + }, + { + "epoch": 0.47638919393806284, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.172510862350464, + "learning_rate": 1e-06, + "loss": 1.1073, + "mean_token_accuracy": 0.674565315246582, + "num_tokens": 112225281.0, + "step": 4338 + }, + { + "epoch": 0.4764990116406765, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.400132894515991, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 
0.7002638578414917, + "num_tokens": 112249829.0, + "step": 4339 + }, + { + "epoch": 0.47660882934329013, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.1270675659179688, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6912182569503784, + "num_tokens": 112279445.0, + "step": 4340 + }, + { + "epoch": 0.4767186470459038, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.146080255508423, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6900859475135803, + "num_tokens": 112309786.0, + "step": 4341 + }, + { + "epoch": 0.4768284647485175, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.404137134552002, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6839436888694763, + "num_tokens": 112338737.0, + "step": 4342 + }, + { + "epoch": 0.4769382824511311, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 1.9257566928863525, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6840701103210449, + "num_tokens": 112376012.0, + "step": 4343 + }, + { + "epoch": 0.47704810015374477, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.6249449253082275, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6906678080558777, + "num_tokens": 112398477.0, + "step": 4344 + }, + { + "epoch": 0.47715791785635847, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.480384349822998, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7013466358184814, + "num_tokens": 112421259.0, + "step": 4345 + }, + { + "epoch": 0.4772677355589721, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.424466133117676, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6850407719612122, + "num_tokens": 112445827.0, + "step": 4346 + }, + { + "epoch": 0.47737755326158576, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.7164201736450195, + "learning_rate": 1e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.6824001669883728, + 
"num_tokens": 112468017.0, + "step": 4347 + }, + { + "epoch": 0.4774873709641994, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.442413330078125, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6782501935958862, + "num_tokens": 112493136.0, + "step": 4348 + }, + { + "epoch": 0.4775971886668131, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.307321786880493, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6891202330589294, + "num_tokens": 112517619.0, + "step": 4349 + }, + { + "epoch": 0.47770700636942676, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.125340223312378, + "learning_rate": 1e-06, + "loss": 1.152, + "mean_token_accuracy": 0.669058084487915, + "num_tokens": 112553523.0, + "step": 4350 + }, + { + "epoch": 0.4778168240720404, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2974815368652344, + "learning_rate": 1e-06, + "loss": 1.1895, + "mean_token_accuracy": 0.6553062796592712, + "num_tokens": 112581580.0, + "step": 4351 + }, + { + "epoch": 0.47792664177465405, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.2323038578033447, + "learning_rate": 1e-06, + "loss": 1.0973, + "mean_token_accuracy": 0.6826788187026978, + "num_tokens": 112612031.0, + "step": 4352 + }, + { + "epoch": 0.47803645947726775, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.079444169998169, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7061601281166077, + "num_tokens": 112642814.0, + "step": 4353 + }, + { + "epoch": 0.4781462771798814, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.538966178894043, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7037258148193359, + "num_tokens": 112665706.0, + "step": 4354 + }, + { + "epoch": 0.47825609488249504, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.36006760597229, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6907109022140503, + "num_tokens": 112690358.0, + "step": 
4355 + }, + { + "epoch": 0.47836591258510874, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 2.112048864364624, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6938063502311707, + "num_tokens": 112719946.0, + "step": 4356 + }, + { + "epoch": 0.4784757302877224, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 7.033819675445557, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6924304962158203, + "num_tokens": 112744322.0, + "step": 4357 + }, + { + "epoch": 0.47858554799033604, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.323428153991699, + "learning_rate": 1e-06, + "loss": 1.1313, + "mean_token_accuracy": 0.6710484623908997, + "num_tokens": 112775838.0, + "step": 4358 + }, + { + "epoch": 0.4786953656929497, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.2676048278808594, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7104080319404602, + "num_tokens": 112803649.0, + "step": 4359 + }, + { + "epoch": 0.4788051833955634, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.319949150085449, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6827713251113892, + "num_tokens": 112831812.0, + "step": 4360 + }, + { + "epoch": 0.47891500109817703, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.2520253658294678, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7236911058425903, + "num_tokens": 112858557.0, + "step": 4361 + }, + { + "epoch": 0.4790248188007907, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.224299669265747, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.703772246837616, + "num_tokens": 112884800.0, + "step": 4362 + }, + { + "epoch": 0.4791346365034044, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.240833044052124, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6995313167572021, + "num_tokens": 112913848.0, + "step": 4363 + }, + { + "epoch": 
0.479244454206018, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.4280142784118652, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7029114365577698, + "num_tokens": 112939866.0, + "step": 4364 + }, + { + "epoch": 0.47935427190863167, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.329015016555786, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7340056896209717, + "num_tokens": 112965056.0, + "step": 4365 + }, + { + "epoch": 0.4794640896112453, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.2672669887542725, + "learning_rate": 1e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.6864919066429138, + "num_tokens": 112993346.0, + "step": 4366 + }, + { + "epoch": 0.479573907313859, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.3598499298095703, + "learning_rate": 1e-06, + "loss": 1.1091, + "mean_token_accuracy": 0.6810184717178345, + "num_tokens": 113020155.0, + "step": 4367 + }, + { + "epoch": 0.47968372501647266, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.3702685832977295, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6949293613433838, + "num_tokens": 113044250.0, + "step": 4368 + }, + { + "epoch": 0.4797935427190863, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.279785394668579, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6932623386383057, + "num_tokens": 113072237.0, + "step": 4369 + }, + { + "epoch": 0.47990336042169995, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.419060468673706, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7055186033248901, + "num_tokens": 113096600.0, + "step": 4370 + }, + { + "epoch": 0.48001317812431366, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.1458170413970947, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6925776600837708, + "num_tokens": 113131476.0, + "step": 4371 + }, + { + "epoch": 0.4801229958269273, + 
"ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.1929829120635986, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6862379312515259, + "num_tokens": 113161614.0, + "step": 4372 + }, + { + "epoch": 0.48023281352954095, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.49809193611145, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6980953216552734, + "num_tokens": 113184723.0, + "step": 4373 + }, + { + "epoch": 0.48034263123215465, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.399782657623291, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6941582560539246, + "num_tokens": 113207737.0, + "step": 4374 + }, + { + "epoch": 0.4804524489347683, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.3308629989624023, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6958364248275757, + "num_tokens": 113236517.0, + "step": 4375 + }, + { + "epoch": 0.48056226663738194, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.4117488861083984, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7133457064628601, + "num_tokens": 113258955.0, + "step": 4376 + }, + { + "epoch": 0.4806720843399956, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.257580280303955, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6977494955062866, + "num_tokens": 113287178.0, + "step": 4377 + }, + { + "epoch": 0.4807819020426093, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.2991976737976074, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7057321071624756, + "num_tokens": 113314384.0, + "step": 4378 + }, + { + "epoch": 0.48089171974522293, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.566488742828369, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.6929975748062134, + "num_tokens": 113335624.0, + "step": 4379 + }, + { + "epoch": 0.4810015374478366, + "ewc_loss": 
1.1026859283447266e-05, + "grad_norm": 2.328155994415283, + "learning_rate": 1e-06, + "loss": 1.0911, + "mean_token_accuracy": 0.6819104552268982, + "num_tokens": 113364171.0, + "step": 4380 + }, + { + "epoch": 0.4811113551504502, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.701106309890747, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.711392879486084, + "num_tokens": 113385080.0, + "step": 4381 + }, + { + "epoch": 0.4812211728530639, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.2555294036865234, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6964610815048218, + "num_tokens": 113413851.0, + "step": 4382 + }, + { + "epoch": 0.4813309905556776, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.242035150527954, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6804027557373047, + "num_tokens": 113444131.0, + "step": 4383 + }, + { + "epoch": 0.4814408082582912, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.2257795333862305, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6866490840911865, + "num_tokens": 113473839.0, + "step": 4384 + }, + { + "epoch": 0.4815506259609049, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.5088369846343994, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6900711059570312, + "num_tokens": 113496652.0, + "step": 4385 + }, + { + "epoch": 0.48166044366351857, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.595417022705078, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6979902982711792, + "num_tokens": 113517813.0, + "step": 4386 + }, + { + "epoch": 0.4817702613661322, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.3457469940185547, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6888096332550049, + "num_tokens": 113544589.0, + "step": 4387 + }, + { + "epoch": 0.48188007906874586, + "ewc_loss": 1.1026859283447266e-05, + 
"grad_norm": 2.171523332595825, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6937720775604248, + "num_tokens": 113575425.0, + "step": 4388 + }, + { + "epoch": 0.48198989677135956, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 2.3143012523651123, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7124807834625244, + "num_tokens": 113600542.0, + "step": 4389 + }, + { + "epoch": 0.4820997144739732, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.3186771869659424, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7062228918075562, + "num_tokens": 113626938.0, + "step": 4390 + }, + { + "epoch": 0.48220953217658685, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.3398780822753906, + "learning_rate": 1e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6924571394920349, + "num_tokens": 113653288.0, + "step": 4391 + }, + { + "epoch": 0.48231934987920055, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.3184478282928467, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6985224485397339, + "num_tokens": 113680097.0, + "step": 4392 + }, + { + "epoch": 0.4824291675818142, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.5430748462677, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6921617388725281, + "num_tokens": 113703073.0, + "step": 4393 + }, + { + "epoch": 0.48253898528442785, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.5028226375579834, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7036255598068237, + "num_tokens": 113727831.0, + "step": 4394 + }, + { + "epoch": 0.4826488029870415, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.538311004638672, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.7009498476982117, + "num_tokens": 113750652.0, + "step": 4395 + }, + { + "epoch": 0.4827586206896552, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.323113203048706, 
+ "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6980807185173035, + "num_tokens": 113777924.0, + "step": 4396 + }, + { + "epoch": 0.48286843839226884, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.5021941661834717, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7128646969795227, + "num_tokens": 113801616.0, + "step": 4397 + }, + { + "epoch": 0.4829782560948825, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.589691400527954, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6833552718162537, + "num_tokens": 113827240.0, + "step": 4398 + }, + { + "epoch": 0.48308807379749613, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.6527578830718994, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7314658761024475, + "num_tokens": 113847276.0, + "step": 4399 + }, + { + "epoch": 0.48319789150010983, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.1964876651763916, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.7010839581489563, + "num_tokens": 113876130.0, + "step": 4400 + }, + { + "epoch": 0.4833077092027235, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.4080612659454346, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.7044386863708496, + "num_tokens": 113900524.0, + "step": 4401 + }, + { + "epoch": 0.4834175269053371, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.6335525512695312, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7141101360321045, + "num_tokens": 113920621.0, + "step": 4402 + }, + { + "epoch": 0.4835273446079508, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.5115714073181152, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7213684916496277, + "num_tokens": 113942610.0, + "step": 4403 + }, + { + "epoch": 0.4836371623105645, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.3435728549957275, + "learning_rate": 1e-06, + 
"loss": 1.0463, + "mean_token_accuracy": 0.6856564283370972, + "num_tokens": 113968029.0, + "step": 4404 + }, + { + "epoch": 0.4837469800131781, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.6476821899414062, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7108420133590698, + "num_tokens": 113992927.0, + "step": 4405 + }, + { + "epoch": 0.48385679771579176, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.38903546333313, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6941415667533875, + "num_tokens": 114018573.0, + "step": 4406 + }, + { + "epoch": 0.48396661541840547, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.265031099319458, + "learning_rate": 1e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.683817446231842, + "num_tokens": 114048655.0, + "step": 4407 + }, + { + "epoch": 0.4840764331210191, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.392106533050537, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.693747878074646, + "num_tokens": 114071771.0, + "step": 4408 + }, + { + "epoch": 0.48418625082363276, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.5309255123138428, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.7034841179847717, + "num_tokens": 114094334.0, + "step": 4409 + }, + { + "epoch": 0.4842960685262464, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.4683055877685547, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7087172269821167, + "num_tokens": 114118168.0, + "step": 4410 + }, + { + "epoch": 0.4844058862288601, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.7628934383392334, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6924055814743042, + "num_tokens": 114138611.0, + "step": 4411 + }, + { + "epoch": 0.48451570393147375, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.409142255783081, + "learning_rate": 1e-06, + "loss": 0.9216, + 
"mean_token_accuracy": 0.717718243598938, + "num_tokens": 114160678.0, + "step": 4412 + }, + { + "epoch": 0.4846255216340874, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.6424145698547363, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7298179864883423, + "num_tokens": 114180098.0, + "step": 4413 + }, + { + "epoch": 0.4847353393367011, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.502983808517456, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7192183136940002, + "num_tokens": 114203210.0, + "step": 4414 + }, + { + "epoch": 0.48484515703931474, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.1503260135650635, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.7022466659545898, + "num_tokens": 114232636.0, + "step": 4415 + }, + { + "epoch": 0.4849549747419284, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.537938356399536, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6974844336509705, + "num_tokens": 114256563.0, + "step": 4416 + }, + { + "epoch": 0.48506479244454204, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.687706470489502, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6973405480384827, + "num_tokens": 114282338.0, + "step": 4417 + }, + { + "epoch": 0.48517461014715574, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.444810628890991, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6894224882125854, + "num_tokens": 114306573.0, + "step": 4418 + }, + { + "epoch": 0.4852844278497694, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.277625322341919, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7110862731933594, + "num_tokens": 114330804.0, + "step": 4419 + }, + { + "epoch": 0.48539424555238303, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.103219509124756, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 
0.6802853345870972, + "num_tokens": 114362071.0, + "step": 4420 + }, + { + "epoch": 0.48550406325499673, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.6007964611053467, + "learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6823282241821289, + "num_tokens": 114386985.0, + "step": 4421 + }, + { + "epoch": 0.4856138809576104, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.5851690769195557, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7104517817497253, + "num_tokens": 114408362.0, + "step": 4422 + }, + { + "epoch": 0.485723698660224, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.1718668937683105, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6920620203018188, + "num_tokens": 114437239.0, + "step": 4423 + }, + { + "epoch": 0.48583351636283767, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.251211643218994, + "learning_rate": 1e-06, + "loss": 1.0886, + "mean_token_accuracy": 0.6783658266067505, + "num_tokens": 114466495.0, + "step": 4424 + }, + { + "epoch": 0.48594333406545137, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.6501495838165283, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7127218246459961, + "num_tokens": 114486989.0, + "step": 4425 + }, + { + "epoch": 0.486053151768065, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.4625113010406494, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.697393536567688, + "num_tokens": 114509544.0, + "step": 4426 + }, + { + "epoch": 0.48616296947067866, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.411416530609131, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6878788471221924, + "num_tokens": 114533663.0, + "step": 4427 + }, + { + "epoch": 0.4862727871732923, + "ewc_loss": 1.1086463928222656e-05, + "grad_norm": 2.141414165496826, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6797927021980286, + 
"num_tokens": 114562909.0, + "step": 4428 + }, + { + "epoch": 0.486382604875906, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.4937057495117188, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.684842586517334, + "num_tokens": 114585352.0, + "step": 4429 + }, + { + "epoch": 0.48649242257851966, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.486691951751709, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7157377600669861, + "num_tokens": 114607829.0, + "step": 4430 + }, + { + "epoch": 0.4866022402811333, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.936532735824585, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7184617519378662, + "num_tokens": 114635523.0, + "step": 4431 + }, + { + "epoch": 0.486712057983747, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.315598249435425, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7164852619171143, + "num_tokens": 114660260.0, + "step": 4432 + }, + { + "epoch": 0.48682187568636065, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.246194362640381, + "learning_rate": 1e-06, + "loss": 1.0801, + "mean_token_accuracy": 0.6819159984588623, + "num_tokens": 114688684.0, + "step": 4433 + }, + { + "epoch": 0.4869316933889743, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.5095672607421875, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7165281772613525, + "num_tokens": 114711677.0, + "step": 4434 + }, + { + "epoch": 0.48704151109158794, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.3949105739593506, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6983503699302673, + "num_tokens": 114735640.0, + "step": 4435 + }, + { + "epoch": 0.48715132879420164, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.1882777214050293, + "learning_rate": 1e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.684952974319458, + "num_tokens": 114765799.0, + "step": 
4436 + }, + { + "epoch": 0.4872611464968153, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.0680699348449707, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6864453554153442, + "num_tokens": 114795705.0, + "step": 4437 + }, + { + "epoch": 0.48737096419942894, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.425828456878662, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6923831701278687, + "num_tokens": 114820558.0, + "step": 4438 + }, + { + "epoch": 0.48748078190204264, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.285435199737549, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6983528137207031, + "num_tokens": 114847518.0, + "step": 4439 + }, + { + "epoch": 0.4875905996046563, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.19463849067688, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6906042098999023, + "num_tokens": 114875666.0, + "step": 4440 + }, + { + "epoch": 0.48770041730726993, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.3754701614379883, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6976948976516724, + "num_tokens": 114906232.0, + "step": 4441 + }, + { + "epoch": 0.4878102350098836, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.567188024520874, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7042237520217896, + "num_tokens": 114926526.0, + "step": 4442 + }, + { + "epoch": 0.4879200527124973, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.5211362838745117, + "learning_rate": 1e-06, + "loss": 1.1024, + "mean_token_accuracy": 0.684136152267456, + "num_tokens": 114954475.0, + "step": 4443 + }, + { + "epoch": 0.4880298704151109, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.483400821685791, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6892874240875244, + "num_tokens": 114981638.0, + "step": 4444 + }, + { + "epoch": 
0.48813968811772457, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.3829941749572754, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7177538871765137, + "num_tokens": 115006462.0, + "step": 4445 + }, + { + "epoch": 0.4882495058203382, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.3509905338287354, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7147853374481201, + "num_tokens": 115031636.0, + "step": 4446 + }, + { + "epoch": 0.4883593235229519, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.4634692668914795, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7075979709625244, + "num_tokens": 115056335.0, + "step": 4447 + }, + { + "epoch": 0.48846914122556556, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.560403347015381, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6927221417427063, + "num_tokens": 115078790.0, + "step": 4448 + }, + { + "epoch": 0.4885789589281792, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.5895907878875732, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7156053185462952, + "num_tokens": 115101699.0, + "step": 4449 + }, + { + "epoch": 0.4886887766307929, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.412783622741699, + "learning_rate": 1e-06, + "loss": 1.1169, + "mean_token_accuracy": 0.6741990447044373, + "num_tokens": 115128503.0, + "step": 4450 + }, + { + "epoch": 0.48879859433340656, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.045297861099243, + "learning_rate": 1e-06, + "loss": 1.1094, + "mean_token_accuracy": 0.6831973791122437, + "num_tokens": 115164634.0, + "step": 4451 + }, + { + "epoch": 0.4889084120360202, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 2.5596230030059814, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6881182193756104, + "num_tokens": 115187246.0, + "step": 4452 + }, + { + "epoch": 0.48901822973863385, + 
"ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.31009578704834, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7039700746536255, + "num_tokens": 115213586.0, + "step": 4453 + }, + { + "epoch": 0.48912804744124755, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.5750014781951904, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7223026752471924, + "num_tokens": 115235227.0, + "step": 4454 + }, + { + "epoch": 0.4892378651438612, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.1797258853912354, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.704564094543457, + "num_tokens": 115263710.0, + "step": 4455 + }, + { + "epoch": 0.48934768284647484, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.3470587730407715, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7099927663803101, + "num_tokens": 115289195.0, + "step": 4456 + }, + { + "epoch": 0.4894575005490885, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.371910810470581, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.699927031993866, + "num_tokens": 115313974.0, + "step": 4457 + }, + { + "epoch": 0.4895673182517022, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.4133827686309814, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6967051029205322, + "num_tokens": 115339114.0, + "step": 4458 + }, + { + "epoch": 0.48967713595431583, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.3422367572784424, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6940221786499023, + "num_tokens": 115365289.0, + "step": 4459 + }, + { + "epoch": 0.4897869536569295, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.4836034774780273, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7050246000289917, + "num_tokens": 115387444.0, + "step": 4460 + }, + { + "epoch": 0.4898967713595432, + "ewc_loss": 
1.1205673217773438e-05, + "grad_norm": 2.4966371059417725, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6948763132095337, + "num_tokens": 115409230.0, + "step": 4461 + }, + { + "epoch": 0.4900065890621568, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.5798263549804688, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7150379419326782, + "num_tokens": 115430694.0, + "step": 4462 + }, + { + "epoch": 0.4901164067647705, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.4193670749664307, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6834622621536255, + "num_tokens": 115456685.0, + "step": 4463 + }, + { + "epoch": 0.4902262244673841, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.2571802139282227, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6888742446899414, + "num_tokens": 115483755.0, + "step": 4464 + }, + { + "epoch": 0.4903360421699978, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.6996371746063232, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.72139573097229, + "num_tokens": 115502535.0, + "step": 4465 + }, + { + "epoch": 0.49044585987261147, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.4354779720306396, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6988545060157776, + "num_tokens": 115526350.0, + "step": 4466 + }, + { + "epoch": 0.4905556775752251, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.5894505977630615, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7130906581878662, + "num_tokens": 115548180.0, + "step": 4467 + }, + { + "epoch": 0.4906654952778388, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.2467775344848633, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6936924457550049, + "num_tokens": 115575251.0, + "step": 4468 + }, + { + "epoch": 0.49077531298045246, + "ewc_loss": 1.1205673217773438e-05, + 
"grad_norm": 2.5019330978393555, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.727197527885437, + "num_tokens": 115595870.0, + "step": 4469 + }, + { + "epoch": 0.4908851306830661, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.2970829010009766, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6859886646270752, + "num_tokens": 115622500.0, + "step": 4470 + }, + { + "epoch": 0.49099494838567975, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.5743470191955566, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6884106397628784, + "num_tokens": 115644812.0, + "step": 4471 + }, + { + "epoch": 0.49110476608829345, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.398869276046753, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.709825873374939, + "num_tokens": 115668225.0, + "step": 4472 + }, + { + "epoch": 0.4912145837909071, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.553069829940796, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7208020687103271, + "num_tokens": 115690591.0, + "step": 4473 + }, + { + "epoch": 0.49132440149352075, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.5249342918395996, + "learning_rate": 1e-06, + "loss": 1.081, + "mean_token_accuracy": 0.683122992515564, + "num_tokens": 115715069.0, + "step": 4474 + }, + { + "epoch": 0.4914342191961344, + "ewc_loss": 1.1265277862548828e-05, + "grad_norm": 2.534559965133667, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7007935047149658, + "num_tokens": 115737035.0, + "step": 4475 + }, + { + "epoch": 0.4915440368987481, + "ewc_loss": 1.1265277862548828e-05, + "grad_norm": 2.2739522457122803, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.7033205032348633, + "num_tokens": 115762455.0, + "step": 4476 + }, + { + "epoch": 0.49165385460136174, + "ewc_loss": 1.1265277862548828e-05, + "grad_norm": 2.4324259757995605, + 
"learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7050439715385437, + "num_tokens": 115786117.0, + "step": 4477 + }, + { + "epoch": 0.4917636723039754, + "ewc_loss": 1.1265277862548828e-05, + "grad_norm": 2.6163330078125, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7248901128768921, + "num_tokens": 115809341.0, + "step": 4478 + }, + { + "epoch": 0.4918734900065891, + "ewc_loss": 1.1265277862548828e-05, + "grad_norm": 2.5165891647338867, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7031959295272827, + "num_tokens": 115832269.0, + "step": 4479 + }, + { + "epoch": 0.49198330770920273, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.3101303577423096, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6986136436462402, + "num_tokens": 115859033.0, + "step": 4480 + }, + { + "epoch": 0.4920931254118164, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.4697976112365723, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.70450758934021, + "num_tokens": 115881543.0, + "step": 4481 + }, + { + "epoch": 0.49220294311443, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.324702501296997, + "learning_rate": 1e-06, + "loss": 1.1147, + "mean_token_accuracy": 0.6806052923202515, + "num_tokens": 115908358.0, + "step": 4482 + }, + { + "epoch": 0.4923127608170437, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.3376736640930176, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.701552152633667, + "num_tokens": 115934190.0, + "step": 4483 + }, + { + "epoch": 0.4924225785196574, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.411348581314087, + "learning_rate": 1e-06, + "loss": 1.08, + "mean_token_accuracy": 0.6816550493240356, + "num_tokens": 115958843.0, + "step": 4484 + }, + { + "epoch": 0.492532396222271, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.2978692054748535, + "learning_rate": 1e-06, + "loss": 1.0712, + 
"mean_token_accuracy": 0.683645486831665, + "num_tokens": 115984943.0, + "step": 4485 + }, + { + "epoch": 0.49264221392488466, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.192544460296631, + "learning_rate": 1e-06, + "loss": 1.1407, + "mean_token_accuracy": 0.6760193109512329, + "num_tokens": 116015263.0, + "step": 4486 + }, + { + "epoch": 0.49275203162749837, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.262274980545044, + "learning_rate": 1e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.6886060833930969, + "num_tokens": 116042471.0, + "step": 4487 + }, + { + "epoch": 0.492861849330112, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 2.4669041633605957, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.7023743391036987, + "num_tokens": 116067499.0, + "step": 4488 + }, + { + "epoch": 0.49297166703272566, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.4933037757873535, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7161675691604614, + "num_tokens": 116088440.0, + "step": 4489 + }, + { + "epoch": 0.49308148473533936, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.5552351474761963, + "learning_rate": 1e-06, + "loss": 1.0672, + "mean_token_accuracy": 0.6849493384361267, + "num_tokens": 116111112.0, + "step": 4490 + }, + { + "epoch": 0.493191302437953, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.4083304405212402, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7000418305397034, + "num_tokens": 116135455.0, + "step": 4491 + }, + { + "epoch": 0.49330112014056665, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.4349844455718994, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6858272552490234, + "num_tokens": 116160890.0, + "step": 4492 + }, + { + "epoch": 0.4934109378431803, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.343897581100464, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 
0.7118769884109497, + "num_tokens": 116187808.0, + "step": 4493 + }, + { + "epoch": 0.493520755545794, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.2353055477142334, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6867823600769043, + "num_tokens": 116217838.0, + "step": 4494 + }, + { + "epoch": 0.49363057324840764, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.2258172035217285, + "learning_rate": 1e-06, + "loss": 1.0982, + "mean_token_accuracy": 0.6866453886032104, + "num_tokens": 116246981.0, + "step": 4495 + }, + { + "epoch": 0.4937403909510213, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.229785680770874, + "learning_rate": 1e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.67646324634552, + "num_tokens": 116276532.0, + "step": 4496 + }, + { + "epoch": 0.493850208653635, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.3624649047851562, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7061581611633301, + "num_tokens": 116302544.0, + "step": 4497 + }, + { + "epoch": 0.49396002635624864, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.544323682785034, + "learning_rate": 1e-06, + "loss": 1.1353, + "mean_token_accuracy": 0.6715187430381775, + "num_tokens": 116326255.0, + "step": 4498 + }, + { + "epoch": 0.4940698440588623, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.346736192703247, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7031459808349609, + "num_tokens": 116351717.0, + "step": 4499 + }, + { + "epoch": 0.49417966176147593, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.241288185119629, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.688837468624115, + "num_tokens": 116381049.0, + "step": 4500 + }, + { + "epoch": 0.49428947946408963, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.480562448501587, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7113476395606995, + "num_tokens": 
116404558.0, + "step": 4501 + }, + { + "epoch": 0.4943992971667033, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.4152235984802246, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6925737857818604, + "num_tokens": 116430821.0, + "step": 4502 + }, + { + "epoch": 0.4945091148693169, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.332282781600952, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7128145694732666, + "num_tokens": 116456843.0, + "step": 4503 + }, + { + "epoch": 0.49461893257193057, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.2806153297424316, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7001902461051941, + "num_tokens": 116488522.0, + "step": 4504 + }, + { + "epoch": 0.49472875027454427, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.082991361618042, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6881269216537476, + "num_tokens": 116518860.0, + "step": 4505 + }, + { + "epoch": 0.4948385679771579, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.400909662246704, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7118797302246094, + "num_tokens": 116545374.0, + "step": 4506 + }, + { + "epoch": 0.49494838567977156, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.3279850482940674, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7019047737121582, + "num_tokens": 116572772.0, + "step": 4507 + }, + { + "epoch": 0.49505820338238526, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.1946537494659424, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6933788657188416, + "num_tokens": 116601968.0, + "step": 4508 + }, + { + "epoch": 0.4951680210849989, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.565495491027832, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6973862648010254, + "num_tokens": 116624913.0, + "step": 4509 + 
}, + { + "epoch": 0.49527783878761256, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.5482892990112305, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.695834755897522, + "num_tokens": 116646611.0, + "step": 4510 + }, + { + "epoch": 0.4953876564902262, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.485943555831909, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6939601302146912, + "num_tokens": 116669267.0, + "step": 4511 + }, + { + "epoch": 0.4954974741928399, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.430800676345825, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7062897086143494, + "num_tokens": 116694288.0, + "step": 4512 + }, + { + "epoch": 0.49560729189545355, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.3615424633026123, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.711036205291748, + "num_tokens": 116719128.0, + "step": 4513 + }, + { + "epoch": 0.4957171095980672, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.4470460414886475, + "learning_rate": 1e-06, + "loss": 1.1093, + "mean_token_accuracy": 0.6851381659507751, + "num_tokens": 116745096.0, + "step": 4514 + }, + { + "epoch": 0.4958269273006809, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.784270763397217, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7111161351203918, + "num_tokens": 116763826.0, + "step": 4515 + }, + { + "epoch": 0.49593674500329454, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2637252807617188, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7007436752319336, + "num_tokens": 116795641.0, + "step": 4516 + }, + { + "epoch": 0.4960465627059082, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.3176841735839844, + "learning_rate": 1e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.6869869828224182, + "num_tokens": 116822911.0, + "step": 4517 + }, + { + "epoch": 0.49615638040852184, + 
"ewc_loss": 1.138448715209961e-05, + "grad_norm": 2.477905511856079, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7002525925636292, + "num_tokens": 116846246.0, + "step": 4518 + }, + { + "epoch": 0.49626619811113554, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.159489393234253, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6981773972511292, + "num_tokens": 116874624.0, + "step": 4519 + }, + { + "epoch": 0.4963760158137492, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.2454864978790283, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7138825058937073, + "num_tokens": 116899231.0, + "step": 4520 + }, + { + "epoch": 0.49648583351636283, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.4994473457336426, + "learning_rate": 1e-06, + "loss": 1.1176, + "mean_token_accuracy": 0.6709226369857788, + "num_tokens": 116924433.0, + "step": 4521 + }, + { + "epoch": 0.4965956512189765, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.2812864780426025, + "learning_rate": 1e-06, + "loss": 1.0933, + "mean_token_accuracy": 0.6836867332458496, + "num_tokens": 116954796.0, + "step": 4522 + }, + { + "epoch": 0.4967054689215902, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.1246252059936523, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7022374868392944, + "num_tokens": 116984925.0, + "step": 4523 + }, + { + "epoch": 0.4968152866242038, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.2406013011932373, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6956214904785156, + "num_tokens": 117012418.0, + "step": 4524 + }, + { + "epoch": 0.49692510432681747, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.2026495933532715, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7128357887268066, + "num_tokens": 117040062.0, + "step": 4525 + }, + { + "epoch": 0.49703492202943117, + "ewc_loss": 1.1444091796875e-05, 
+ "grad_norm": 2.112517833709717, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6860489845275879, + "num_tokens": 117069854.0, + "step": 4526 + }, + { + "epoch": 0.4971447397320448, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.156489372253418, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6989124417304993, + "num_tokens": 117098015.0, + "step": 4527 + }, + { + "epoch": 0.49725455743465846, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.3916187286376953, + "learning_rate": 1e-06, + "loss": 1.0997, + "mean_token_accuracy": 0.6802518367767334, + "num_tokens": 117127797.0, + "step": 4528 + }, + { + "epoch": 0.4973643751372721, + "ewc_loss": 1.138448715209961e-05, + "grad_norm": 2.540407180786133, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6937094926834106, + "num_tokens": 117147759.0, + "step": 4529 + }, + { + "epoch": 0.4974741928398858, + "ewc_loss": 1.138448715209961e-05, + "grad_norm": 2.371060371398926, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6904729008674622, + "num_tokens": 117171512.0, + "step": 4530 + }, + { + "epoch": 0.49758401054249946, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.3915200233459473, + "learning_rate": 1e-06, + "loss": 1.0911, + "mean_token_accuracy": 0.6894064545631409, + "num_tokens": 117196904.0, + "step": 4531 + }, + { + "epoch": 0.4976938282451131, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.6262807846069336, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7047709822654724, + "num_tokens": 117218115.0, + "step": 4532 + }, + { + "epoch": 0.49780364594772675, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2252583503723145, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7089889049530029, + "num_tokens": 117244201.0, + "step": 4533 + }, + { + "epoch": 0.49791346365034045, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.326056480407715, + 
"learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7324808239936829, + "num_tokens": 117267186.0, + "step": 4534 + }, + { + "epoch": 0.4980232813529541, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.1168971061706543, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6934697031974792, + "num_tokens": 117297522.0, + "step": 4535 + }, + { + "epoch": 0.49813309905556774, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 2.4184610843658447, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6912316083908081, + "num_tokens": 117323164.0, + "step": 4536 + }, + { + "epoch": 0.49824291675818144, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.4380152225494385, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6982784271240234, + "num_tokens": 117347079.0, + "step": 4537 + }, + { + "epoch": 0.4983527344607951, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.4040870666503906, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7048439979553223, + "num_tokens": 117373678.0, + "step": 4538 + }, + { + "epoch": 0.49846255216340873, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2758233547210693, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.6893937587738037, + "num_tokens": 117400721.0, + "step": 4539 + }, + { + "epoch": 0.4985723698660224, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.3856160640716553, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7104994654655457, + "num_tokens": 117425568.0, + "step": 4540 + }, + { + "epoch": 0.4986821875686361, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.570232629776001, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6980471611022949, + "num_tokens": 117448642.0, + "step": 4541 + }, + { + "epoch": 0.4987920052712497, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.579192638397217, + "learning_rate": 1e-06, + "loss": 1.0363, + 
"mean_token_accuracy": 0.6999230980873108, + "num_tokens": 117471867.0, + "step": 4542 + }, + { + "epoch": 0.4989018229738634, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.0712435245513916, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6988577842712402, + "num_tokens": 117503893.0, + "step": 4543 + }, + { + "epoch": 0.4990116406764771, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2708301544189453, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6923947930335999, + "num_tokens": 117532300.0, + "step": 4544 + }, + { + "epoch": 0.4991214583790907, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.346027135848999, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7093098163604736, + "num_tokens": 117557057.0, + "step": 4545 + }, + { + "epoch": 0.49923127608170437, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2287724018096924, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6925981044769287, + "num_tokens": 117586763.0, + "step": 4546 + }, + { + "epoch": 0.499341093784318, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.331173896789551, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.7042711973190308, + "num_tokens": 117611783.0, + "step": 4547 + }, + { + "epoch": 0.4994509114869317, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2883694171905518, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7256338596343994, + "num_tokens": 117638085.0, + "step": 4548 + }, + { + "epoch": 0.49956072918954536, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.273136615753174, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6939689517021179, + "num_tokens": 117669103.0, + "step": 4549 + }, + { + "epoch": 0.499670546892159, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.436809778213501, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6864437460899353, + "num_tokens": 
117693830.0, + "step": 4550 + }, + { + "epoch": 0.49978036459477265, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.634265422821045, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6907360553741455, + "num_tokens": 117714230.0, + "step": 4551 + }, + { + "epoch": 0.49989018229738635, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.0914103984832764, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6912274956703186, + "num_tokens": 117747833.0, + "step": 4552 + }, + { + "epoch": 0.5, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.5457048416137695, + "learning_rate": 1e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.683005154132843, + "num_tokens": 117773052.0, + "step": 4553 + }, + { + "epoch": 0.5001098177026136, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.367161750793457, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7037645578384399, + "num_tokens": 117798234.0, + "step": 4554 + }, + { + "epoch": 0.5002196354052273, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.675645112991333, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7064056992530823, + "num_tokens": 117820361.0, + "step": 4555 + }, + { + "epoch": 0.5003294531078409, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.460221290588379, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6862348914146423, + "num_tokens": 117846772.0, + "step": 4556 + }, + { + "epoch": 0.5004392708104547, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.21185040473938, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6892738938331604, + "num_tokens": 117877864.0, + "step": 4557 + }, + { + "epoch": 0.5005490885130683, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.280519485473633, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6788252592086792, + "num_tokens": 117905908.0, + "step": 4558 + }, + { + "epoch": 
0.500658906215682, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.4495863914489746, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.698431134223938, + "num_tokens": 117931453.0, + "step": 4559 + }, + { + "epoch": 0.5007687239182956, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.691049098968506, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7059645652770996, + "num_tokens": 117953672.0, + "step": 4560 + }, + { + "epoch": 0.5008785416209093, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.32596492767334, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6854121088981628, + "num_tokens": 117979254.0, + "step": 4561 + }, + { + "epoch": 0.5009883593235229, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.3324742317199707, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.702476978302002, + "num_tokens": 118007249.0, + "step": 4562 + }, + { + "epoch": 0.5010981770261366, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.196572780609131, + "learning_rate": 1e-06, + "loss": 1.0942, + "mean_token_accuracy": 0.6798808574676514, + "num_tokens": 118038228.0, + "step": 4563 + }, + { + "epoch": 0.5012079947287503, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2962212562561035, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.7010116577148438, + "num_tokens": 118064982.0, + "step": 4564 + }, + { + "epoch": 0.501317812431364, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.419769763946533, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7025416493415833, + "num_tokens": 118088105.0, + "step": 4565 + }, + { + "epoch": 0.5014276301339776, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.0868842601776123, + "learning_rate": 1e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6942660808563232, + "num_tokens": 118123176.0, + "step": 4566 + }, + { + "epoch": 0.5015374478365913, + "ewc_loss": 1.1444091796875e-05, + 
"grad_norm": 2.307098388671875, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6999467611312866, + "num_tokens": 118150578.0, + "step": 4567 + }, + { + "epoch": 0.5016472655392049, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.286280393600464, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7213567495346069, + "num_tokens": 118177358.0, + "step": 4568 + }, + { + "epoch": 0.5017570832418186, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2451093196868896, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6983581185340881, + "num_tokens": 118207315.0, + "step": 4569 + }, + { + "epoch": 0.5018669009444322, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.324214220046997, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6977765560150146, + "num_tokens": 118231938.0, + "step": 4570 + }, + { + "epoch": 0.5019767186470458, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.1992950439453125, + "learning_rate": 1e-06, + "loss": 1.1313, + "mean_token_accuracy": 0.6681985259056091, + "num_tokens": 118264767.0, + "step": 4571 + }, + { + "epoch": 0.5020865363496596, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.765798330307007, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6851457357406616, + "num_tokens": 118287546.0, + "step": 4572 + }, + { + "epoch": 0.5021963540522733, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.4707295894622803, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7092036008834839, + "num_tokens": 118310519.0, + "step": 4573 + }, + { + "epoch": 0.5023061717548869, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2574944496154785, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6994086503982544, + "num_tokens": 118335584.0, + "step": 4574 + }, + { + "epoch": 0.5024159894575005, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.556968927383423, + "learning_rate": 1e-06, + 
"loss": 0.9623, + "mean_token_accuracy": 0.713962197303772, + "num_tokens": 118355882.0, + "step": 4575 + }, + { + "epoch": 0.5025258071601142, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.473088264465332, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6970458030700684, + "num_tokens": 118379758.0, + "step": 4576 + }, + { + "epoch": 0.5026356248627278, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2678627967834473, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6923937797546387, + "num_tokens": 118409998.0, + "step": 4577 + }, + { + "epoch": 0.5027454425653415, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.177656412124634, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7004259824752808, + "num_tokens": 118437895.0, + "step": 4578 + }, + { + "epoch": 0.5028552602679552, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.210462808609009, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6932206153869629, + "num_tokens": 118466254.0, + "step": 4579 + }, + { + "epoch": 0.5029650779705689, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.4113588333129883, + "learning_rate": 1e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6845062375068665, + "num_tokens": 118491266.0, + "step": 4580 + }, + { + "epoch": 0.5030748956731825, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.711153507232666, + "learning_rate": 1e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.6863291263580322, + "num_tokens": 118513453.0, + "step": 4581 + }, + { + "epoch": 0.5031847133757962, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.555555820465088, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6866111755371094, + "num_tokens": 118536240.0, + "step": 4582 + }, + { + "epoch": 0.5032945310784098, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.256666660308838, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 
0.6941236257553101, + "num_tokens": 118566445.0, + "step": 4583 + }, + { + "epoch": 0.5034043487810235, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3771839141845703, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6929462552070618, + "num_tokens": 118591213.0, + "step": 4584 + }, + { + "epoch": 0.5035141664836371, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2776479721069336, + "learning_rate": 1e-06, + "loss": 1.1558, + "mean_token_accuracy": 0.669905960559845, + "num_tokens": 118621747.0, + "step": 4585 + }, + { + "epoch": 0.5036239841862509, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.8635402917861938, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6936612725257874, + "num_tokens": 118660496.0, + "step": 4586 + }, + { + "epoch": 0.5037338018888645, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.517115831375122, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7112948894500732, + "num_tokens": 118682445.0, + "step": 4587 + }, + { + "epoch": 0.5038436195914782, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.207371711730957, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6999760866165161, + "num_tokens": 118709958.0, + "step": 4588 + }, + { + "epoch": 0.5039534372940918, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.29582142829895, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7048017978668213, + "num_tokens": 118736291.0, + "step": 4589 + }, + { + "epoch": 0.5040632549967055, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 3.759504795074463, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6865527629852295, + "num_tokens": 118763755.0, + "step": 4590 + }, + { + "epoch": 0.5041730726993191, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5097908973693848, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7089971303939819, + "num_tokens": 
118785592.0, + "step": 4591 + }, + { + "epoch": 0.5042828904019327, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1009256839752197, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6975677013397217, + "num_tokens": 118815809.0, + "step": 4592 + }, + { + "epoch": 0.5043927081045465, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1073367595672607, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7007830739021301, + "num_tokens": 118844118.0, + "step": 4593 + }, + { + "epoch": 0.5045025258071602, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.0199365615844727, + "learning_rate": 1e-06, + "loss": 1.167, + "mean_token_accuracy": 0.6588159799575806, + "num_tokens": 118878278.0, + "step": 4594 + }, + { + "epoch": 0.5046123435097738, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.312527656555176, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6916837096214294, + "num_tokens": 118904888.0, + "step": 4595 + }, + { + "epoch": 0.5047221612123874, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.0949385166168213, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6990492939949036, + "num_tokens": 118936516.0, + "step": 4596 + }, + { + "epoch": 0.5048319789150011, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.09824538230896, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6862934827804565, + "num_tokens": 118965982.0, + "step": 4597 + }, + { + "epoch": 0.5049417966176147, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.430657148361206, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7142927050590515, + "num_tokens": 118989660.0, + "step": 4598 + }, + { + "epoch": 0.5050516143202284, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.4482390880584717, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7109830975532532, + "num_tokens": 119014512.0, + "step": 4599 + }, + { + 
"epoch": 0.505161432022842, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 3.719055652618408, + "learning_rate": 1e-06, + "loss": 1.0914, + "mean_token_accuracy": 0.6783653497695923, + "num_tokens": 119046114.0, + "step": 4600 + }, + { + "epoch": 0.5052712497254558, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.7362253665924072, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6969287395477295, + "num_tokens": 119069967.0, + "step": 4601 + }, + { + "epoch": 0.5053810674280694, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2582972049713135, + "learning_rate": 1e-06, + "loss": 1.1089, + "mean_token_accuracy": 0.6783238649368286, + "num_tokens": 119102712.0, + "step": 4602 + }, + { + "epoch": 0.5054908851306831, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.279343605041504, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6967400312423706, + "num_tokens": 119131270.0, + "step": 4603 + }, + { + "epoch": 0.5056007028332967, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 3.0921430587768555, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7158191204071045, + "num_tokens": 119147580.0, + "step": 4604 + }, + { + "epoch": 0.5057105205359104, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.1988365650177, + "learning_rate": 1e-06, + "loss": 1.0951, + "mean_token_accuracy": 0.6779724359512329, + "num_tokens": 119176404.0, + "step": 4605 + }, + { + "epoch": 0.505820338238524, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3214540481567383, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6961638927459717, + "num_tokens": 119203393.0, + "step": 4606 + }, + { + "epoch": 0.5059301559411377, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.256166458129883, + "learning_rate": 1e-06, + "loss": 1.124, + "mean_token_accuracy": 0.6735639572143555, + "num_tokens": 119231833.0, + "step": 4607 + }, + { + "epoch": 0.5060399736437514, + "ewc_loss": 
1.1563301086425781e-05, + "grad_norm": 2.4786856174468994, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.699055552482605, + "num_tokens": 119256386.0, + "step": 4608 + }, + { + "epoch": 0.5061497913463651, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2030746936798096, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.680717945098877, + "num_tokens": 119285024.0, + "step": 4609 + }, + { + "epoch": 0.5062596090489787, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3929378986358643, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7192484736442566, + "num_tokens": 119308733.0, + "step": 4610 + }, + { + "epoch": 0.5063694267515924, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5798351764678955, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6923421621322632, + "num_tokens": 119332389.0, + "step": 4611 + }, + { + "epoch": 0.506479244454206, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.258676528930664, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6914889216423035, + "num_tokens": 119360140.0, + "step": 4612 + }, + { + "epoch": 0.5065890621568196, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.63744854927063, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.7173436284065247, + "num_tokens": 119381098.0, + "step": 4613 + }, + { + "epoch": 0.5066988798594333, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.3948874473571777, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7259422540664673, + "num_tokens": 119406309.0, + "step": 4614 + }, + { + "epoch": 0.506808697562047, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 2.465120792388916, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7223706245422363, + "num_tokens": 119429642.0, + "step": 4615 + }, + { + "epoch": 0.5069185152646607, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 
2.0914461612701416, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6905316114425659, + "num_tokens": 119461475.0, + "step": 4616 + }, + { + "epoch": 0.5070283329672743, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4238204956054688, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7094653248786926, + "num_tokens": 119485600.0, + "step": 4617 + }, + { + "epoch": 0.507138150669888, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.334343194961548, + "learning_rate": 1e-06, + "loss": 1.1141, + "mean_token_accuracy": 0.6814128160476685, + "num_tokens": 119511824.0, + "step": 4618 + }, + { + "epoch": 0.5072479683725016, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5984044075012207, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7065045833587646, + "num_tokens": 119533133.0, + "step": 4619 + }, + { + "epoch": 0.5073577860751153, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4547767639160156, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7002105712890625, + "num_tokens": 119557080.0, + "step": 4620 + }, + { + "epoch": 0.5074676037777289, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.592258930206299, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.689799964427948, + "num_tokens": 119577569.0, + "step": 4621 + }, + { + "epoch": 0.5075774214803427, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3423633575439453, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6903148889541626, + "num_tokens": 119605645.0, + "step": 4622 + }, + { + "epoch": 0.5076872391829563, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5983526706695557, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.705115556716919, + "num_tokens": 119626742.0, + "step": 4623 + }, + { + "epoch": 0.50779705688557, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.537339210510254, + "learning_rate": 
1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6988217830657959, + "num_tokens": 119648331.0, + "step": 4624 + }, + { + "epoch": 0.5079068745881836, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.407048463821411, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7062480449676514, + "num_tokens": 119671073.0, + "step": 4625 + }, + { + "epoch": 0.5080166922907973, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3062491416931152, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7036261558532715, + "num_tokens": 119697145.0, + "step": 4626 + }, + { + "epoch": 0.5081265099934109, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.602065324783325, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6907414197921753, + "num_tokens": 119719279.0, + "step": 4627 + }, + { + "epoch": 0.5082363276960246, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2028250694274902, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7069491744041443, + "num_tokens": 119746924.0, + "step": 4628 + }, + { + "epoch": 0.5083461453986382, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.680025339126587, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6797817945480347, + "num_tokens": 119769037.0, + "step": 4629 + }, + { + "epoch": 0.508455963101252, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.170078992843628, + "learning_rate": 1e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.6855283379554749, + "num_tokens": 119797640.0, + "step": 4630 + }, + { + "epoch": 0.5085657808038656, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.286184072494507, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7119501829147339, + "num_tokens": 119821460.0, + "step": 4631 + }, + { + "epoch": 0.5086755985064793, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.270557403564453, + "learning_rate": 1e-06, + "loss": 1.0594, + 
"mean_token_accuracy": 0.6839280128479004, + "num_tokens": 119850542.0, + "step": 4632 + }, + { + "epoch": 0.5087854162090929, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2008955478668213, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.704301118850708, + "num_tokens": 119877816.0, + "step": 4633 + }, + { + "epoch": 0.5088952339117065, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1968510150909424, + "learning_rate": 1e-06, + "loss": 1.08, + "mean_token_accuracy": 0.6818400621414185, + "num_tokens": 119907607.0, + "step": 4634 + }, + { + "epoch": 0.5090050516143202, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2874386310577393, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.696496307849884, + "num_tokens": 119933810.0, + "step": 4635 + }, + { + "epoch": 0.5091148693169338, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4073052406311035, + "learning_rate": 1e-06, + "loss": 1.08, + "mean_token_accuracy": 0.6771847009658813, + "num_tokens": 119960256.0, + "step": 4636 + }, + { + "epoch": 0.5092246870195476, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2122015953063965, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6993787288665771, + "num_tokens": 119988437.0, + "step": 4637 + }, + { + "epoch": 0.5093345047221612, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3366875648498535, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.701695442199707, + "num_tokens": 120012553.0, + "step": 4638 + }, + { + "epoch": 0.5094443224247749, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.369473695755005, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6972375512123108, + "num_tokens": 120037367.0, + "step": 4639 + }, + { + "epoch": 0.5095541401273885, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.6630280017852783, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 
0.7198069095611572, + "num_tokens": 120056816.0, + "step": 4640 + }, + { + "epoch": 0.5096639578300022, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1305413246154785, + "learning_rate": 1e-06, + "loss": 1.0912, + "mean_token_accuracy": 0.6810213327407837, + "num_tokens": 120090690.0, + "step": 4641 + }, + { + "epoch": 0.5097737755326158, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.467219114303589, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7030097842216492, + "num_tokens": 120114819.0, + "step": 4642 + }, + { + "epoch": 0.5098835932352295, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2433884143829346, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7046303749084473, + "num_tokens": 120144814.0, + "step": 4643 + }, + { + "epoch": 0.5099934109378432, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.365819215774536, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7185139060020447, + "num_tokens": 120168876.0, + "step": 4644 + }, + { + "epoch": 0.5101032286404569, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.36751651763916, + "learning_rate": 1e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6787588596343994, + "num_tokens": 120196920.0, + "step": 4645 + }, + { + "epoch": 0.5102130463430705, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4209868907928467, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6958272457122803, + "num_tokens": 120222102.0, + "step": 4646 + }, + { + "epoch": 0.5103228640456842, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.329911947250366, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7014597058296204, + "num_tokens": 120247712.0, + "step": 4647 + }, + { + "epoch": 0.5104326817482978, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.0777859687805176, + "learning_rate": 1e-06, + "loss": 1.0827, + "mean_token_accuracy": 0.6878865957260132, + "num_tokens": 
120281315.0, + "step": 4648 + }, + { + "epoch": 0.5105424994509115, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2394113540649414, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.7021144032478333, + "num_tokens": 120309377.0, + "step": 4649 + }, + { + "epoch": 0.5106523171535251, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2045154571533203, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6855137944221497, + "num_tokens": 120337949.0, + "step": 4650 + }, + { + "epoch": 0.5107621348561389, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3180437088012695, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7049127221107483, + "num_tokens": 120362736.0, + "step": 4651 + }, + { + "epoch": 0.5108719525587525, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2109570503234863, + "learning_rate": 1e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.6897121667861938, + "num_tokens": 120394003.0, + "step": 4652 + }, + { + "epoch": 0.5109817702613662, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.509734869003296, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6909737586975098, + "num_tokens": 120417078.0, + "step": 4653 + }, + { + "epoch": 0.5110915879639798, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3410887718200684, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7208608388900757, + "num_tokens": 120440605.0, + "step": 4654 + }, + { + "epoch": 0.5112014056665934, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.589289665222168, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7050877809524536, + "num_tokens": 120463428.0, + "step": 4655 + }, + { + "epoch": 0.5113112233692071, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.235450506210327, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7015284299850464, + "num_tokens": 120493073.0, + "step": 4656 + }, + { 
+ "epoch": 0.5114210410718207, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4373862743377686, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6981551647186279, + "num_tokens": 120518143.0, + "step": 4657 + }, + { + "epoch": 0.5115308587744345, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.438368797302246, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7027989029884338, + "num_tokens": 120542359.0, + "step": 4658 + }, + { + "epoch": 0.5116406764770481, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.308958053588867, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.7011386156082153, + "num_tokens": 120571395.0, + "step": 4659 + }, + { + "epoch": 0.5117504941796618, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.0648481845855713, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6799089908599854, + "num_tokens": 120607912.0, + "step": 4660 + }, + { + "epoch": 0.5118603118822754, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1256024837493896, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6980199217796326, + "num_tokens": 120639223.0, + "step": 4661 + }, + { + "epoch": 0.5119701295848891, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.607574462890625, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7055746912956238, + "num_tokens": 120659595.0, + "step": 4662 + }, + { + "epoch": 0.5120799472875027, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2112696170806885, + "learning_rate": 1e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6853700280189514, + "num_tokens": 120694465.0, + "step": 4663 + }, + { + "epoch": 0.5121897649901164, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.298842668533325, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6958929300308228, + "num_tokens": 120722417.0, + "step": 4664 + }, + { + "epoch": 0.51229958269273, + 
"ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2854816913604736, + "learning_rate": 1e-06, + "loss": 1.1049, + "mean_token_accuracy": 0.6805369853973389, + "num_tokens": 120750601.0, + "step": 4665 + }, + { + "epoch": 0.5124094003953438, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.221435546875, + "learning_rate": 1e-06, + "loss": 1.0963, + "mean_token_accuracy": 0.6753905415534973, + "num_tokens": 120781378.0, + "step": 4666 + }, + { + "epoch": 0.5125192180979574, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.372384548187256, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6920086741447449, + "num_tokens": 120807616.0, + "step": 4667 + }, + { + "epoch": 0.5126290358005711, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.413012742996216, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6931324601173401, + "num_tokens": 120832422.0, + "step": 4668 + }, + { + "epoch": 0.5127388535031847, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2681822776794434, + "learning_rate": 1e-06, + "loss": 1.0999, + "mean_token_accuracy": 0.6748956441879272, + "num_tokens": 120859714.0, + "step": 4669 + }, + { + "epoch": 0.5128486712057984, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.544161081314087, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.711341142654419, + "num_tokens": 120883835.0, + "step": 4670 + }, + { + "epoch": 0.512958488908412, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4144372940063477, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6939256191253662, + "num_tokens": 120907148.0, + "step": 4671 + }, + { + "epoch": 0.5130683066110256, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1435797214508057, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6999906301498413, + "num_tokens": 120937197.0, + "step": 4672 + }, + { + "epoch": 0.5131781243136394, + "ewc_loss": 1.1563301086425781e-05, + 
"grad_norm": 2.4080026149749756, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7093404531478882, + "num_tokens": 120960227.0, + "step": 4673 + }, + { + "epoch": 0.513287942016253, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.266497850418091, + "learning_rate": 1e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.6952688694000244, + "num_tokens": 120987671.0, + "step": 4674 + }, + { + "epoch": 0.5133977597188667, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3493430614471436, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7152116298675537, + "num_tokens": 121014518.0, + "step": 4675 + }, + { + "epoch": 0.5135075774214803, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2484166622161865, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6873090863227844, + "num_tokens": 121046518.0, + "step": 4676 + }, + { + "epoch": 0.513617395124094, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4426355361938477, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7043783664703369, + "num_tokens": 121070381.0, + "step": 4677 + }, + { + "epoch": 0.5137272128267076, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3201119899749756, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7050855159759521, + "num_tokens": 121093702.0, + "step": 4678 + }, + { + "epoch": 0.5138370305293213, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.8231701850891113, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6937025189399719, + "num_tokens": 121114919.0, + "step": 4679 + }, + { + "epoch": 0.513946848231935, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4600348472595215, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6965908408164978, + "num_tokens": 121137886.0, + "step": 4680 + }, + { + "epoch": 0.5140566659345487, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.349689245223999, + 
"learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6997778415679932, + "num_tokens": 121164767.0, + "step": 4681 + }, + { + "epoch": 0.5141664836371623, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1866912841796875, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6908462643623352, + "num_tokens": 121194080.0, + "step": 4682 + }, + { + "epoch": 0.514276301339776, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.077530860900879, + "learning_rate": 1e-06, + "loss": 1.1422, + "mean_token_accuracy": 0.6663826704025269, + "num_tokens": 121226531.0, + "step": 4683 + }, + { + "epoch": 0.5143861190423896, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.46199631690979, + "learning_rate": 1e-06, + "loss": 1.1295, + "mean_token_accuracy": 0.6734148263931274, + "num_tokens": 121254072.0, + "step": 4684 + }, + { + "epoch": 0.5144959367450033, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.43357253074646, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.69890958070755, + "num_tokens": 121279731.0, + "step": 4685 + }, + { + "epoch": 0.5146057544476169, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.9980506896972656, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7166056036949158, + "num_tokens": 121312266.0, + "step": 4686 + }, + { + "epoch": 0.5147155721502307, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.338517904281616, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6969156265258789, + "num_tokens": 121337329.0, + "step": 4687 + }, + { + "epoch": 0.5148253898528443, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.0950615406036377, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6913544535636902, + "num_tokens": 121369123.0, + "step": 4688 + }, + { + "epoch": 0.514935207555458, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5141124725341797, + "learning_rate": 1e-06, + "loss": 1.0169, + 
"mean_token_accuracy": 0.7042807936668396, + "num_tokens": 121391602.0, + "step": 4689 + }, + { + "epoch": 0.5150450252580716, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2770347595214844, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6836487650871277, + "num_tokens": 121418720.0, + "step": 4690 + }, + { + "epoch": 0.5151548429606853, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.252101421356201, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7168450951576233, + "num_tokens": 121444116.0, + "step": 4691 + }, + { + "epoch": 0.5152646606632989, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.142003059387207, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6969181299209595, + "num_tokens": 121474478.0, + "step": 4692 + }, + { + "epoch": 0.5153744783659125, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.236215353012085, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6947885751724243, + "num_tokens": 121502744.0, + "step": 4693 + }, + { + "epoch": 0.5154842960685262, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.24135684967041, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7205119729042053, + "num_tokens": 121530615.0, + "step": 4694 + }, + { + "epoch": 0.51559411377114, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.185382604598999, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6987704634666443, + "num_tokens": 121559108.0, + "step": 4695 + }, + { + "epoch": 0.5157039314737536, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2034220695495605, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.7055778503417969, + "num_tokens": 121588392.0, + "step": 4696 + }, + { + "epoch": 0.5158137491763672, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.126950740814209, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 
0.7066912055015564, + "num_tokens": 121617083.0, + "step": 4697 + }, + { + "epoch": 0.5159235668789809, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4854025840759277, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7111498713493347, + "num_tokens": 121641301.0, + "step": 4698 + }, + { + "epoch": 0.5160333845815945, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.137777328491211, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7036599516868591, + "num_tokens": 121671011.0, + "step": 4699 + }, + { + "epoch": 0.5161432022842082, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3088324069976807, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7124593257904053, + "num_tokens": 121697278.0, + "step": 4700 + }, + { + "epoch": 0.5162530199868218, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2075655460357666, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6995840072631836, + "num_tokens": 121724693.0, + "step": 4701 + }, + { + "epoch": 0.5163628376894356, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2691726684570312, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6966999173164368, + "num_tokens": 121753616.0, + "step": 4702 + }, + { + "epoch": 0.5164726553920492, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2197060585021973, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7046979665756226, + "num_tokens": 121781118.0, + "step": 4703 + }, + { + "epoch": 0.5165824730946629, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.217231035232544, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7184354066848755, + "num_tokens": 121807666.0, + "step": 4704 + }, + { + "epoch": 0.5166922907972765, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.619493246078491, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7296767830848694, + 
"num_tokens": 121826186.0, + "step": 4705 + }, + { + "epoch": 0.5168021084998902, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.386026620864868, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7067599296569824, + "num_tokens": 121853749.0, + "step": 4706 + }, + { + "epoch": 0.5169119262025038, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5065598487854004, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6956072449684143, + "num_tokens": 121879214.0, + "step": 4707 + }, + { + "epoch": 0.5170217439051175, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.322394371032715, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.7025419473648071, + "num_tokens": 121905782.0, + "step": 4708 + }, + { + "epoch": 0.5171315616077312, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1782331466674805, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6881090402603149, + "num_tokens": 121935314.0, + "step": 4709 + }, + { + "epoch": 0.5172413793103449, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.097593307495117, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7125332951545715, + "num_tokens": 121965963.0, + "step": 4710 + }, + { + "epoch": 0.5173511970129585, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.157623291015625, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7057271003723145, + "num_tokens": 121994548.0, + "step": 4711 + }, + { + "epoch": 0.5174610147155722, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3367340564727783, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7145577073097229, + "num_tokens": 122021964.0, + "step": 4712 + }, + { + "epoch": 0.5175708324181858, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.7769293785095215, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7119383215904236, + "num_tokens": 122043758.0, + "step": 
4713 + }, + { + "epoch": 0.5176806501207994, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3894667625427246, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7114956378936768, + "num_tokens": 122068302.0, + "step": 4714 + }, + { + "epoch": 0.5177904678234131, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.332526206970215, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.702966034412384, + "num_tokens": 122095220.0, + "step": 4715 + }, + { + "epoch": 0.5179002855260268, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4690189361572266, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6990399956703186, + "num_tokens": 122117264.0, + "step": 4716 + }, + { + "epoch": 0.5180101032286405, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5950238704681396, + "learning_rate": 1e-06, + "loss": 1.1441, + "mean_token_accuracy": 0.6583767533302307, + "num_tokens": 122140503.0, + "step": 4717 + }, + { + "epoch": 0.5181199209312541, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4741318225860596, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6968408823013306, + "num_tokens": 122162514.0, + "step": 4718 + }, + { + "epoch": 0.5182297386338678, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5284290313720703, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6940227746963501, + "num_tokens": 122184999.0, + "step": 4719 + }, + { + "epoch": 0.5183395563364814, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2946858406066895, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7096392512321472, + "num_tokens": 122213918.0, + "step": 4720 + }, + { + "epoch": 0.5184493740390951, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4914209842681885, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7055649161338806, + "num_tokens": 122237734.0, + "step": 4721 + }, + { + "epoch": 
0.5185591917417087, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4753682613372803, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6981009244918823, + "num_tokens": 122260160.0, + "step": 4722 + }, + { + "epoch": 0.5186690094443224, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2632782459259033, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.717370867729187, + "num_tokens": 122286833.0, + "step": 4723 + }, + { + "epoch": 0.5187788271469361, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.300415515899658, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7331628203392029, + "num_tokens": 122310433.0, + "step": 4724 + }, + { + "epoch": 0.5188886448495498, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4833743572235107, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6958062648773193, + "num_tokens": 122334006.0, + "step": 4725 + }, + { + "epoch": 0.5189984625521634, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.335275411605835, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7037715315818787, + "num_tokens": 122361228.0, + "step": 4726 + }, + { + "epoch": 0.5191082802547771, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.334392547607422, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7207604646682739, + "num_tokens": 122386834.0, + "step": 4727 + }, + { + "epoch": 0.5192180979573907, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.456540107727051, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6962752342224121, + "num_tokens": 122411805.0, + "step": 4728 + }, + { + "epoch": 0.5193279156600044, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.0890438556671143, + "learning_rate": 1e-06, + "loss": 1.137, + "mean_token_accuracy": 0.6755713820457458, + "num_tokens": 122446482.0, + "step": 4729 + }, + { + "epoch": 0.519437733362618, + "ewc_loss": 
1.1563301086425781e-05, + "grad_norm": 2.3572661876678467, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7051000595092773, + "num_tokens": 122473380.0, + "step": 4730 + }, + { + "epoch": 0.5195475510652318, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5178990364074707, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7178215384483337, + "num_tokens": 122497542.0, + "step": 4731 + }, + { + "epoch": 0.5196573687678454, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2518386840820312, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6895025968551636, + "num_tokens": 122526094.0, + "step": 4732 + }, + { + "epoch": 0.519767186470459, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.221200942993164, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6877569556236267, + "num_tokens": 122555978.0, + "step": 4733 + }, + { + "epoch": 0.5198770041730727, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2318334579467773, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7018547058105469, + "num_tokens": 122584360.0, + "step": 4734 + }, + { + "epoch": 0.5199868218756863, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5295283794403076, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6932197213172913, + "num_tokens": 122606846.0, + "step": 4735 + }, + { + "epoch": 0.5200966395783, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.7215938568115234, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7162928581237793, + "num_tokens": 122626112.0, + "step": 4736 + }, + { + "epoch": 0.5202064572809136, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4702868461608887, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6892434358596802, + "num_tokens": 122650732.0, + "step": 4737 + }, + { + "epoch": 0.5203162749835274, + "ewc_loss": 1.1563301086425781e-05, + 
"grad_norm": 2.4600157737731934, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7106349468231201, + "num_tokens": 122674126.0, + "step": 4738 + }, + { + "epoch": 0.520426092686141, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.537825107574463, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7086841464042664, + "num_tokens": 122697488.0, + "step": 4739 + }, + { + "epoch": 0.5205359103887547, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4813778400421143, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6991899013519287, + "num_tokens": 122722311.0, + "step": 4740 + }, + { + "epoch": 0.5206457280913683, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3791935443878174, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7068519592285156, + "num_tokens": 122748210.0, + "step": 4741 + }, + { + "epoch": 0.520755545793982, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.466458320617676, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6845002770423889, + "num_tokens": 122772245.0, + "step": 4742 + }, + { + "epoch": 0.5208653634965956, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.306648015975952, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6977294683456421, + "num_tokens": 122798903.0, + "step": 4743 + }, + { + "epoch": 0.5209751811992093, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.7826266288757324, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6972683668136597, + "num_tokens": 122818671.0, + "step": 4744 + }, + { + "epoch": 0.521084998901823, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3038806915283203, + "learning_rate": 1e-06, + "loss": 1.1684, + "mean_token_accuracy": 0.6653969883918762, + "num_tokens": 122849140.0, + "step": 4745 + }, + { + "epoch": 0.5211948166044367, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2084364891052246, + 
"learning_rate": 1e-06, + "loss": 1.1033, + "mean_token_accuracy": 0.6928387880325317, + "num_tokens": 122879171.0, + "step": 4746 + }, + { + "epoch": 0.5213046343070503, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2678256034851074, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6961400508880615, + "num_tokens": 122908443.0, + "step": 4747 + }, + { + "epoch": 0.521414452009664, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.519070863723755, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7155240774154663, + "num_tokens": 122928803.0, + "step": 4748 + }, + { + "epoch": 0.5215242697122776, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.431858539581299, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6986212730407715, + "num_tokens": 122954480.0, + "step": 4749 + }, + { + "epoch": 0.5216340874148913, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.351017475128174, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7102109789848328, + "num_tokens": 122982025.0, + "step": 4750 + }, + { + "epoch": 0.5217439051175049, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5784573554992676, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7045453786849976, + "num_tokens": 123009122.0, + "step": 4751 + }, + { + "epoch": 0.5218537228201185, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.473541498184204, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.722541332244873, + "num_tokens": 123034418.0, + "step": 4752 + }, + { + "epoch": 0.5219635405227323, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.7009966373443604, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7010387778282166, + "num_tokens": 123055576.0, + "step": 4753 + }, + { + "epoch": 0.522073358225346, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.889543056488037, + "learning_rate": 1e-06, + "loss": 
1.0013, + "mean_token_accuracy": 0.7023733854293823, + "num_tokens": 123074540.0, + "step": 4754 + }, + { + "epoch": 0.5221831759279596, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.573885679244995, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.7040687799453735, + "num_tokens": 123097449.0, + "step": 4755 + }, + { + "epoch": 0.5222929936305732, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3519744873046875, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6893321871757507, + "num_tokens": 123125290.0, + "step": 4756 + }, + { + "epoch": 0.5224028113331869, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.144667863845825, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6958513855934143, + "num_tokens": 123155920.0, + "step": 4757 + }, + { + "epoch": 0.5225126290358005, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.478848934173584, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7166764140129089, + "num_tokens": 123176959.0, + "step": 4758 + }, + { + "epoch": 0.5226224467384142, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.366335153579712, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6893596649169922, + "num_tokens": 123202287.0, + "step": 4759 + }, + { + "epoch": 0.5227322644410279, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.535083293914795, + "learning_rate": 1e-06, + "loss": 1.1015, + "mean_token_accuracy": 0.6858991384506226, + "num_tokens": 123225282.0, + "step": 4760 + }, + { + "epoch": 0.5228420821436416, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.515854835510254, + "learning_rate": 1e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6792789697647095, + "num_tokens": 123251470.0, + "step": 4761 + }, + { + "epoch": 0.5229518998462552, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.1788878440856934, + "learning_rate": 1e-06, + "loss": 1.069, + "mean_token_accuracy": 
0.684813916683197, + "num_tokens": 123280914.0, + "step": 4762 + }, + { + "epoch": 0.5230617175488689, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3502414226531982, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7087873220443726, + "num_tokens": 123307373.0, + "step": 4763 + }, + { + "epoch": 0.5231715352514825, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.131497383117676, + "learning_rate": 1e-06, + "loss": 1.0936, + "mean_token_accuracy": 0.6788284778594971, + "num_tokens": 123341593.0, + "step": 4764 + }, + { + "epoch": 0.5232813529540962, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3878543376922607, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7097296714782715, + "num_tokens": 123367583.0, + "step": 4765 + }, + { + "epoch": 0.5233911706567098, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.9861303567886353, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6869685649871826, + "num_tokens": 123404988.0, + "step": 4766 + }, + { + "epoch": 0.5235009883593236, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.255946397781372, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7116308212280273, + "num_tokens": 123429995.0, + "step": 4767 + }, + { + "epoch": 0.5236108060619372, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2672481536865234, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.692682147026062, + "num_tokens": 123458296.0, + "step": 4768 + }, + { + "epoch": 0.5237206237645509, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4115397930145264, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6984921097755432, + "num_tokens": 123484696.0, + "step": 4769 + }, + { + "epoch": 0.5238304414671645, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.48728346824646, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6962978839874268, + "num_tokens": 
123511954.0, + "step": 4770 + }, + { + "epoch": 0.5239402591697782, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.782134532928467, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.715172529220581, + "num_tokens": 123532302.0, + "step": 4771 + }, + { + "epoch": 0.5240500768723918, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5069379806518555, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.7059982419013977, + "num_tokens": 123557394.0, + "step": 4772 + }, + { + "epoch": 0.5241598945750054, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5782663822174072, + "learning_rate": 1e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6901315450668335, + "num_tokens": 123581035.0, + "step": 4773 + }, + { + "epoch": 0.5242697122776192, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.261273145675659, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.700042724609375, + "num_tokens": 123609927.0, + "step": 4774 + }, + { + "epoch": 0.5243795299802329, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.442431926727295, + "learning_rate": 1e-06, + "loss": 1.0862, + "mean_token_accuracy": 0.681405782699585, + "num_tokens": 123636801.0, + "step": 4775 + }, + { + "epoch": 0.5244893476828465, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3021557331085205, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7226648926734924, + "num_tokens": 123662412.0, + "step": 4776 + }, + { + "epoch": 0.5245991653854601, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.431114912033081, + "learning_rate": 1e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.6873106956481934, + "num_tokens": 123688200.0, + "step": 4777 + }, + { + "epoch": 0.5247089830880738, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.8133695125579834, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6998619437217712, + "num_tokens": 123707255.0, + "step": 4778 + }, + { + 
"epoch": 0.5248188007906874, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5059492588043213, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7087985277175903, + "num_tokens": 123730727.0, + "step": 4779 + }, + { + "epoch": 0.5249286184933011, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.624920606613159, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7405962944030762, + "num_tokens": 123750769.0, + "step": 4780 + }, + { + "epoch": 0.5250384361959147, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3533124923706055, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7159339189529419, + "num_tokens": 123774443.0, + "step": 4781 + }, + { + "epoch": 0.5251482538985285, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.526883840560913, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6985346674919128, + "num_tokens": 123797174.0, + "step": 4782 + }, + { + "epoch": 0.5252580716011421, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3933486938476562, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6959696412086487, + "num_tokens": 123823711.0, + "step": 4783 + }, + { + "epoch": 0.5253678893037558, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.504026174545288, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6884630918502808, + "num_tokens": 123847343.0, + "step": 4784 + }, + { + "epoch": 0.5254777070063694, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.409803628921509, + "learning_rate": 1e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.6830806732177734, + "num_tokens": 123872690.0, + "step": 4785 + }, + { + "epoch": 0.5255875247089831, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4118990898132324, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.700056791305542, + "num_tokens": 123897410.0, + "step": 4786 + }, + { + "epoch": 0.5256973424115967, + 
"ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.52891206741333, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6917643547058105, + "num_tokens": 123921194.0, + "step": 4787 + }, + { + "epoch": 0.5258071601142104, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3700947761535645, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7058196067810059, + "num_tokens": 123946528.0, + "step": 4788 + }, + { + "epoch": 0.5259169778168241, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.306044578552246, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.702191948890686, + "num_tokens": 123974449.0, + "step": 4789 + }, + { + "epoch": 0.5260267955194378, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5534369945526123, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7174826860427856, + "num_tokens": 123994838.0, + "step": 4790 + }, + { + "epoch": 0.5261366132220514, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2648158073425293, + "learning_rate": 1e-06, + "loss": 1.1524, + "mean_token_accuracy": 0.6628617644309998, + "num_tokens": 124025635.0, + "step": 4791 + }, + { + "epoch": 0.526246430924665, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4640567302703857, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.698790967464447, + "num_tokens": 124052257.0, + "step": 4792 + }, + { + "epoch": 0.5263562486272787, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4311256408691406, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7020962834358215, + "num_tokens": 124076722.0, + "step": 4793 + }, + { + "epoch": 0.5264660663298923, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.767226457595825, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6898877620697021, + "num_tokens": 124095514.0, + "step": 4794 + }, + { + "epoch": 0.526575884032506, + "ewc_loss": 1.1563301086425781e-05, + 
"grad_norm": 2.3027284145355225, + "learning_rate": 1e-06, + "loss": 1.0916, + "mean_token_accuracy": 0.6723183393478394, + "num_tokens": 124123063.0, + "step": 4795 + }, + { + "epoch": 0.5266857017351197, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4013195037841797, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6867368221282959, + "num_tokens": 124146795.0, + "step": 4796 + }, + { + "epoch": 0.5267955194377334, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.4278948307037354, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7052960395812988, + "num_tokens": 124170669.0, + "step": 4797 + }, + { + "epoch": 0.526905337140347, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3069348335266113, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.686765193939209, + "num_tokens": 124195844.0, + "step": 4798 + }, + { + "epoch": 0.5270151548429607, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3255391120910645, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6835393905639648, + "num_tokens": 124223053.0, + "step": 4799 + }, + { + "epoch": 0.5271249725455743, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.385647773742676, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6961116194725037, + "num_tokens": 124248218.0, + "step": 4800 + }, + { + "epoch": 0.527234790248188, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.3361384868621826, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7166671752929688, + "num_tokens": 124273859.0, + "step": 4801 + }, + { + "epoch": 0.5273446079508016, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.695564031600952, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7134634256362915, + "num_tokens": 124294213.0, + "step": 4802 + }, + { + "epoch": 0.5274544256534154, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.2207202911376953, + 
"learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6973785758018494, + "num_tokens": 124319793.0, + "step": 4803 + }, + { + "epoch": 0.527564243356029, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.303213119506836, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7053995728492737, + "num_tokens": 124346866.0, + "step": 4804 + }, + { + "epoch": 0.5276740610586427, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.5937302112579346, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7047095894813538, + "num_tokens": 124368693.0, + "step": 4805 + }, + { + "epoch": 0.5277838787612563, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 2.7078309059143066, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6839749813079834, + "num_tokens": 124388582.0, + "step": 4806 + }, + { + "epoch": 0.52789369646387, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3778305053710938, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7125575542449951, + "num_tokens": 124413481.0, + "step": 4807 + }, + { + "epoch": 0.5280035141664836, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.425971031188965, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7075061798095703, + "num_tokens": 124436784.0, + "step": 4808 + }, + { + "epoch": 0.5281133318690973, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 2.445025682449341, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6844961643218994, + "num_tokens": 124460433.0, + "step": 4809 + }, + { + "epoch": 0.528223149571711, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4349753856658936, + "learning_rate": 1e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.6867477893829346, + "num_tokens": 124484380.0, + "step": 4810 + }, + { + "epoch": 0.5283329672743247, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3806395530700684, + "learning_rate": 1e-06, + "loss": 
0.9777, + "mean_token_accuracy": 0.7096204161643982, + "num_tokens": 124509622.0, + "step": 4811 + }, + { + "epoch": 0.5284427849769383, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1441829204559326, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6881538033485413, + "num_tokens": 124540590.0, + "step": 4812 + }, + { + "epoch": 0.528552602679552, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.759989023208618, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7061343789100647, + "num_tokens": 124562283.0, + "step": 4813 + }, + { + "epoch": 0.5286624203821656, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2711572647094727, + "learning_rate": 1e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.681007981300354, + "num_tokens": 124594488.0, + "step": 4814 + }, + { + "epoch": 0.5287722380847792, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3487987518310547, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7097235321998596, + "num_tokens": 124621489.0, + "step": 4815 + }, + { + "epoch": 0.5288820557873929, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1485695838928223, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.739046037197113, + "num_tokens": 124649436.0, + "step": 4816 + }, + { + "epoch": 0.5289918734900065, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.5747129917144775, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7102622985839844, + "num_tokens": 124671012.0, + "step": 4817 + }, + { + "epoch": 0.5291016911926203, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.421628475189209, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6925430297851562, + "num_tokens": 124694958.0, + "step": 4818 + }, + { + "epoch": 0.5292115088952339, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.6617579460144043, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 
0.7042308449745178, + "num_tokens": 124715344.0, + "step": 4819 + }, + { + "epoch": 0.5293213265978476, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3822081089019775, + "learning_rate": 1e-06, + "loss": 1.1674, + "mean_token_accuracy": 0.6647051572799683, + "num_tokens": 124743751.0, + "step": 4820 + }, + { + "epoch": 0.5294311443004612, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.429698944091797, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7240995168685913, + "num_tokens": 124769264.0, + "step": 4821 + }, + { + "epoch": 0.5295409620030749, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3867812156677246, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6932846307754517, + "num_tokens": 124796623.0, + "step": 4822 + }, + { + "epoch": 0.5296507797056885, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.251622200012207, + "learning_rate": 1e-06, + "loss": 1.1257, + "mean_token_accuracy": 0.6706675291061401, + "num_tokens": 124825236.0, + "step": 4823 + }, + { + "epoch": 0.5297605974083022, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3831212520599365, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6927251219749451, + "num_tokens": 124849570.0, + "step": 4824 + }, + { + "epoch": 0.5298704151109159, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3087637424468994, + "learning_rate": 1e-06, + "loss": 1.0872, + "mean_token_accuracy": 0.6845154166221619, + "num_tokens": 124876140.0, + "step": 4825 + }, + { + "epoch": 0.5299802328135296, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.231355667114258, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6885756850242615, + "num_tokens": 124902546.0, + "step": 4826 + }, + { + "epoch": 0.5300900505161432, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4455978870391846, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.7005954384803772, + "num_tokens": 
124926641.0, + "step": 4827 + }, + { + "epoch": 0.5301998682187569, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1892592906951904, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6988244652748108, + "num_tokens": 124955451.0, + "step": 4828 + }, + { + "epoch": 0.5303096859213705, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2507731914520264, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.711246132850647, + "num_tokens": 124983402.0, + "step": 4829 + }, + { + "epoch": 0.5304195036239842, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3418662548065186, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6985970735549927, + "num_tokens": 125008433.0, + "step": 4830 + }, + { + "epoch": 0.5305293213265978, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.160783052444458, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6974042654037476, + "num_tokens": 125036151.0, + "step": 4831 + }, + { + "epoch": 0.5306391390292116, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4837687015533447, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7193727493286133, + "num_tokens": 125058766.0, + "step": 4832 + }, + { + "epoch": 0.5307489567318252, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.332637071609497, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7125011086463928, + "num_tokens": 125084381.0, + "step": 4833 + }, + { + "epoch": 0.5308587744344389, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3888328075408936, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6845401525497437, + "num_tokens": 125108921.0, + "step": 4834 + }, + { + "epoch": 0.5309685921370525, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2045159339904785, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.7003438472747803, + "num_tokens": 125140054.0, + "step": 4835 + }, + 
{ + "epoch": 0.5310784098396661, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.265885829925537, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6840754151344299, + "num_tokens": 125167042.0, + "step": 4836 + }, + { + "epoch": 0.5311882275422798, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.140789031982422, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7135764956474304, + "num_tokens": 125195924.0, + "step": 4837 + }, + { + "epoch": 0.5312980452448934, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4054627418518066, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6965265870094299, + "num_tokens": 125221340.0, + "step": 4838 + }, + { + "epoch": 0.5314078629475072, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4585397243499756, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6957873106002808, + "num_tokens": 125243894.0, + "step": 4839 + }, + { + "epoch": 0.5315176806501208, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4256465435028076, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7102910876274109, + "num_tokens": 125268549.0, + "step": 4840 + }, + { + "epoch": 0.5316274983527345, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2942094802856445, + "learning_rate": 1e-06, + "loss": 1.0654, + "mean_token_accuracy": 0.6953365802764893, + "num_tokens": 125295550.0, + "step": 4841 + }, + { + "epoch": 0.5317373160553481, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3222057819366455, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6985366344451904, + "num_tokens": 125320744.0, + "step": 4842 + }, + { + "epoch": 0.5318471337579618, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3511085510253906, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.6842972636222839, + "num_tokens": 125348066.0, + "step": 4843 + }, + { + "epoch": 0.5319569514605754, 
+ "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1275134086608887, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6933985948562622, + "num_tokens": 125378429.0, + "step": 4844 + }, + { + "epoch": 0.5320667691631891, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.480419397354126, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6922382116317749, + "num_tokens": 125403980.0, + "step": 4845 + }, + { + "epoch": 0.5321765868658027, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.111842155456543, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6917838454246521, + "num_tokens": 125434953.0, + "step": 4846 + }, + { + "epoch": 0.5322864045684165, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3838982582092285, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7006435394287109, + "num_tokens": 125461614.0, + "step": 4847 + }, + { + "epoch": 0.5323962222710301, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1682348251342773, + "learning_rate": 1e-06, + "loss": 1.1162, + "mean_token_accuracy": 0.6764277219772339, + "num_tokens": 125494269.0, + "step": 4848 + }, + { + "epoch": 0.5325060399736438, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.337449312210083, + "learning_rate": 1e-06, + "loss": 1.1165, + "mean_token_accuracy": 0.6706510186195374, + "num_tokens": 125523086.0, + "step": 4849 + }, + { + "epoch": 0.5326158576762574, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4698574542999268, + "learning_rate": 1e-06, + "loss": 1.0753, + "mean_token_accuracy": 0.6881687641143799, + "num_tokens": 125548116.0, + "step": 4850 + }, + { + "epoch": 0.532725675378871, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.341799020767212, + "learning_rate": 1e-06, + "loss": 1.1134, + "mean_token_accuracy": 0.6718424558639526, + "num_tokens": 125576092.0, + "step": 4851 + }, + { + "epoch": 0.5328354930814847, + "ewc_loss": 
1.1682510375976562e-05, + "grad_norm": 2.3463797569274902, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.700516939163208, + "num_tokens": 125606678.0, + "step": 4852 + }, + { + "epoch": 0.5329453107840983, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.577953815460205, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7047646045684814, + "num_tokens": 125628470.0, + "step": 4853 + }, + { + "epoch": 0.5330551284867121, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.5689425468444824, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6899295449256897, + "num_tokens": 125649194.0, + "step": 4854 + }, + { + "epoch": 0.5331649461893258, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.247192144393921, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6870676875114441, + "num_tokens": 125681786.0, + "step": 4855 + }, + { + "epoch": 0.5332747638919394, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2763264179229736, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7046507000923157, + "num_tokens": 125708871.0, + "step": 4856 + }, + { + "epoch": 0.533384581594553, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3810489177703857, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7045745849609375, + "num_tokens": 125736407.0, + "step": 4857 + }, + { + "epoch": 0.5334943992971667, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3186686038970947, + "learning_rate": 1e-06, + "loss": 1.0835, + "mean_token_accuracy": 0.694573700428009, + "num_tokens": 125764625.0, + "step": 4858 + }, + { + "epoch": 0.5336042169997803, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4810895919799805, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6989029049873352, + "num_tokens": 125788811.0, + "step": 4859 + }, + { + "epoch": 0.533714034702394, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 
2.2261264324188232, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6936467885971069, + "num_tokens": 125816450.0, + "step": 4860 + }, + { + "epoch": 0.5338238524050077, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.37306809425354, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.7004146575927734, + "num_tokens": 125843742.0, + "step": 4861 + }, + { + "epoch": 0.5339336701076214, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.6768810749053955, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6853761672973633, + "num_tokens": 125867806.0, + "step": 4862 + }, + { + "epoch": 0.534043487810235, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.263024091720581, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6898664236068726, + "num_tokens": 125898897.0, + "step": 4863 + }, + { + "epoch": 0.5341533055128487, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.253931999206543, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7140502333641052, + "num_tokens": 125926333.0, + "step": 4864 + }, + { + "epoch": 0.5342631232154623, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.29939603805542, + "learning_rate": 1e-06, + "loss": 1.0839, + "mean_token_accuracy": 0.6793613433837891, + "num_tokens": 125955239.0, + "step": 4865 + }, + { + "epoch": 0.534372940918076, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.160170316696167, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7092581391334534, + "num_tokens": 125984817.0, + "step": 4866 + }, + { + "epoch": 0.5344827586206896, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.6200151443481445, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.7041252851486206, + "num_tokens": 126007118.0, + "step": 4867 + }, + { + "epoch": 0.5345925763233034, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2949485778808594, + "learning_rate": 
1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7006262540817261, + "num_tokens": 126033833.0, + "step": 4868 + }, + { + "epoch": 0.534702394025917, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.383357286453247, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6939926147460938, + "num_tokens": 126058969.0, + "step": 4869 + }, + { + "epoch": 0.5348122117285307, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.175055503845215, + "learning_rate": 1e-06, + "loss": 1.0784, + "mean_token_accuracy": 0.6819711327552795, + "num_tokens": 126087601.0, + "step": 4870 + }, + { + "epoch": 0.5349220294311443, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.627826690673828, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6991998553276062, + "num_tokens": 126108863.0, + "step": 4871 + }, + { + "epoch": 0.535031847133758, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2493155002593994, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7111347317695618, + "num_tokens": 126135520.0, + "step": 4872 + }, + { + "epoch": 0.5351416648363716, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3642823696136475, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6993865370750427, + "num_tokens": 126159142.0, + "step": 4873 + }, + { + "epoch": 0.5352514825389852, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2053401470184326, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7106736898422241, + "num_tokens": 126185981.0, + "step": 4874 + }, + { + "epoch": 0.5353613002415989, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.379284143447876, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6933988332748413, + "num_tokens": 126211810.0, + "step": 4875 + }, + { + "epoch": 0.5354711179442126, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.334423542022705, + "learning_rate": 1e-06, + "loss": 0.9929, + 
"mean_token_accuracy": 0.7024707198143005, + "num_tokens": 126236974.0, + "step": 4876 + }, + { + "epoch": 0.5355809356468263, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4172539710998535, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7007147073745728, + "num_tokens": 126261310.0, + "step": 4877 + }, + { + "epoch": 0.5356907533494399, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.737705707550049, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6913124918937683, + "num_tokens": 126281643.0, + "step": 4878 + }, + { + "epoch": 0.5358005710520536, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.162001371383667, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6870061159133911, + "num_tokens": 126312906.0, + "step": 4879 + }, + { + "epoch": 0.5359103887546672, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1006295680999756, + "learning_rate": 1e-06, + "loss": 1.1423, + "mean_token_accuracy": 0.6652849912643433, + "num_tokens": 126345375.0, + "step": 4880 + }, + { + "epoch": 0.5360202064572809, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.244746208190918, + "learning_rate": 1e-06, + "loss": 1.0925, + "mean_token_accuracy": 0.678528368473053, + "num_tokens": 126376856.0, + "step": 4881 + }, + { + "epoch": 0.5361300241598945, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.398322343826294, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6966540813446045, + "num_tokens": 126401481.0, + "step": 4882 + }, + { + "epoch": 0.5362398418625083, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.7693419456481934, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.697422981262207, + "num_tokens": 126421653.0, + "step": 4883 + }, + { + "epoch": 0.5363496595651219, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.425403356552124, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 
0.6884458661079407, + "num_tokens": 126446537.0, + "step": 4884 + }, + { + "epoch": 0.5364594772677356, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.511274576187134, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7274879217147827, + "num_tokens": 126472324.0, + "step": 4885 + }, + { + "epoch": 0.5365692949703492, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.295022964477539, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7258715033531189, + "num_tokens": 126498138.0, + "step": 4886 + }, + { + "epoch": 0.5366791126729629, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3889377117156982, + "learning_rate": 1e-06, + "loss": 1.096, + "mean_token_accuracy": 0.6735681891441345, + "num_tokens": 126524152.0, + "step": 4887 + }, + { + "epoch": 0.5367889303755765, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.265007734298706, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6987224817276001, + "num_tokens": 126551253.0, + "step": 4888 + }, + { + "epoch": 0.5368987480781902, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1579630374908447, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6889907121658325, + "num_tokens": 126583556.0, + "step": 4889 + }, + { + "epoch": 0.5370085657808039, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.688180685043335, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7138675451278687, + "num_tokens": 126602628.0, + "step": 4890 + }, + { + "epoch": 0.5371183834834176, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1760778427124023, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7074258327484131, + "num_tokens": 126630945.0, + "step": 4891 + }, + { + "epoch": 0.5372282011860312, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1481528282165527, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.7015296220779419, + "num_tokens": 
126660562.0, + "step": 4892 + }, + { + "epoch": 0.5373380188886449, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1445651054382324, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7011538743972778, + "num_tokens": 126691431.0, + "step": 4893 + }, + { + "epoch": 0.5374478365912585, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2562057971954346, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7016039490699768, + "num_tokens": 126718942.0, + "step": 4894 + }, + { + "epoch": 0.5375576542938721, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.844428062438965, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7269903421401978, + "num_tokens": 126737774.0, + "step": 4895 + }, + { + "epoch": 0.5376674719964858, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4573307037353516, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6954069137573242, + "num_tokens": 126764676.0, + "step": 4896 + }, + { + "epoch": 0.5377772896990995, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.6055331230163574, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6968384981155396, + "num_tokens": 126790046.0, + "step": 4897 + }, + { + "epoch": 0.5378871074017132, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.7031009197235107, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7349450588226318, + "num_tokens": 126808339.0, + "step": 4898 + }, + { + "epoch": 0.5379969251043268, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.7556707859039307, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7230880260467529, + "num_tokens": 126832548.0, + "step": 4899 + }, + { + "epoch": 0.5381067428069405, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.0597965717315674, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6926162242889404, + "num_tokens": 126864910.0, + "step": 4900 + }, 
+ { + "epoch": 0.5382165605095541, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.2771434783935547, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6911988258361816, + "num_tokens": 126893270.0, + "step": 4901 + }, + { + "epoch": 0.5383263782121678, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.241952657699585, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6972357034683228, + "num_tokens": 126921753.0, + "step": 4902 + }, + { + "epoch": 0.5384361959147814, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.1448588371276855, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6856971383094788, + "num_tokens": 126955918.0, + "step": 4903 + }, + { + "epoch": 0.5385460136173951, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.540354013442993, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7074253559112549, + "num_tokens": 126978163.0, + "step": 4904 + }, + { + "epoch": 0.5386558313200088, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.515803575515747, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.7041218280792236, + "num_tokens": 127001935.0, + "step": 4905 + }, + { + "epoch": 0.5387656490226225, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.3760907649993896, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7038444876670837, + "num_tokens": 127026364.0, + "step": 4906 + }, + { + "epoch": 0.5388754667252361, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.5308470726013184, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6845674514770508, + "num_tokens": 127051022.0, + "step": 4907 + }, + { + "epoch": 0.5389852844278498, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.4069864749908447, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6952537298202515, + "num_tokens": 127077302.0, + "step": 4908 + }, + { + "epoch": 0.5390951021304634, 
+ "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.2061965465545654, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6947696805000305, + "num_tokens": 127107594.0, + "step": 4909 + }, + { + "epoch": 0.5392049198330771, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.5892345905303955, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7035157680511475, + "num_tokens": 127129814.0, + "step": 4910 + }, + { + "epoch": 0.5393147375356907, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 2.665055274963379, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.714306116104126, + "num_tokens": 127148691.0, + "step": 4911 + }, + { + "epoch": 0.5394245552383045, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.6555092334747314, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7337253093719482, + "num_tokens": 127168865.0, + "step": 4912 + }, + { + "epoch": 0.5395343729409181, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.231919288635254, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6943569183349609, + "num_tokens": 127199139.0, + "step": 4913 + }, + { + "epoch": 0.5396441906435318, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.6467208862304688, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7071000337600708, + "num_tokens": 127219551.0, + "step": 4914 + }, + { + "epoch": 0.5397540083461454, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.228804349899292, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7047690153121948, + "num_tokens": 127247441.0, + "step": 4915 + }, + { + "epoch": 0.539863826048759, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.0286965370178223, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7059949040412903, + "num_tokens": 127280922.0, + "step": 4916 + }, + { + "epoch": 0.5399736437513727, + "ewc_loss": 
1.1742115020751953e-05, + "grad_norm": 2.407606840133667, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7214272022247314, + "num_tokens": 127303976.0, + "step": 4917 + }, + { + "epoch": 0.5400834614539863, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.279022455215454, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7217146158218384, + "num_tokens": 127329285.0, + "step": 4918 + }, + { + "epoch": 0.5401932791566001, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.509037733078003, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7182420492172241, + "num_tokens": 127352261.0, + "step": 4919 + }, + { + "epoch": 0.5403030968592137, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.2455711364746094, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.7019233107566833, + "num_tokens": 127380086.0, + "step": 4920 + }, + { + "epoch": 0.5404129145618274, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.645616054534912, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7156028151512146, + "num_tokens": 127402010.0, + "step": 4921 + }, + { + "epoch": 0.540522732264441, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.460758686065674, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.6889331936836243, + "num_tokens": 127429070.0, + "step": 4922 + }, + { + "epoch": 0.5406325499670547, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.184856653213501, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7092323303222656, + "num_tokens": 127458094.0, + "step": 4923 + }, + { + "epoch": 0.5407423676696683, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.1695396900177, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6996903419494629, + "num_tokens": 127487961.0, + "step": 4924 + }, + { + "epoch": 0.540852185372282, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 
2.3056225776672363, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.700899600982666, + "num_tokens": 127514020.0, + "step": 4925 + }, + { + "epoch": 0.5409620030748957, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.179248571395874, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.690375804901123, + "num_tokens": 127542668.0, + "step": 4926 + }, + { + "epoch": 0.5410718207775094, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.577444076538086, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7032349109649658, + "num_tokens": 127562630.0, + "step": 4927 + }, + { + "epoch": 0.541181638480123, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.207451820373535, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7034949660301208, + "num_tokens": 127592931.0, + "step": 4928 + }, + { + "epoch": 0.5412914561827367, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.2333805561065674, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6899775862693787, + "num_tokens": 127621487.0, + "step": 4929 + }, + { + "epoch": 0.5414012738853503, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.116528272628784, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7270699739456177, + "num_tokens": 127650196.0, + "step": 4930 + }, + { + "epoch": 0.541511091587964, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.545090675354004, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.715710461139679, + "num_tokens": 127673970.0, + "step": 4931 + }, + { + "epoch": 0.5416209092905776, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.2343876361846924, + "learning_rate": 1e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.6851521730422974, + "num_tokens": 127703486.0, + "step": 4932 + }, + { + "epoch": 0.5417307269931912, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.1746435165405273, + "learning_rate": 
1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.70341956615448, + "num_tokens": 127731693.0, + "step": 4933 + }, + { + "epoch": 0.541840544695805, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.393866539001465, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7225078344345093, + "num_tokens": 127754534.0, + "step": 4934 + }, + { + "epoch": 0.5419503623984187, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.128445625305176, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6954049468040466, + "num_tokens": 127785708.0, + "step": 4935 + }, + { + "epoch": 0.5420601801010323, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.5189857482910156, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7213876247406006, + "num_tokens": 127807870.0, + "step": 4936 + }, + { + "epoch": 0.5421699978036459, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.5459625720977783, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6927493810653687, + "num_tokens": 127830538.0, + "step": 4937 + }, + { + "epoch": 0.5422798155062596, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.1289966106414795, + "learning_rate": 1e-06, + "loss": 1.1041, + "mean_token_accuracy": 0.6745866537094116, + "num_tokens": 127861733.0, + "step": 4938 + }, + { + "epoch": 0.5423896332088732, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.434382200241089, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6857296228408813, + "num_tokens": 127886190.0, + "step": 4939 + }, + { + "epoch": 0.5424994509114869, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.373328447341919, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6953479051589966, + "num_tokens": 127914203.0, + "step": 4940 + }, + { + "epoch": 0.5426092686141006, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.308927059173584, + "learning_rate": 1e-06, + "loss": 1.0942, + 
"mean_token_accuracy": 0.6809202432632446, + "num_tokens": 127942783.0, + "step": 4941 + }, + { + "epoch": 0.5427190863167143, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.494295597076416, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6961452960968018, + "num_tokens": 127967032.0, + "step": 4942 + }, + { + "epoch": 0.5428289040193279, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.60520076751709, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.713645339012146, + "num_tokens": 127988188.0, + "step": 4943 + }, + { + "epoch": 0.5429387217219416, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.505115509033203, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.688485860824585, + "num_tokens": 128012088.0, + "step": 4944 + }, + { + "epoch": 0.5430485394245552, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.3583312034606934, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6964637041091919, + "num_tokens": 128037208.0, + "step": 4945 + }, + { + "epoch": 0.5431583571271689, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.345916748046875, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7291001081466675, + "num_tokens": 128062799.0, + "step": 4946 + }, + { + "epoch": 0.5432681748297825, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.2857110500335693, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7137526273727417, + "num_tokens": 128093529.0, + "step": 4947 + }, + { + "epoch": 0.5433779925323963, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.147611379623413, + "learning_rate": 1e-06, + "loss": 1.0875, + "mean_token_accuracy": 0.6763314008712769, + "num_tokens": 128123690.0, + "step": 4948 + }, + { + "epoch": 0.5434878102350099, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.403700113296509, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 
0.6907570362091064, + "num_tokens": 128148433.0, + "step": 4949 + }, + { + "epoch": 0.5435976279376236, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.372234344482422, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6843709945678711, + "num_tokens": 128173520.0, + "step": 4950 + }, + { + "epoch": 0.5437074456402372, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.183759927749634, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6990300416946411, + "num_tokens": 128205469.0, + "step": 4951 + }, + { + "epoch": 0.5438172633428509, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.5032382011413574, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7262840270996094, + "num_tokens": 128228251.0, + "step": 4952 + }, + { + "epoch": 0.5439270810454645, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.222961187362671, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7105957269668579, + "num_tokens": 128255630.0, + "step": 4953 + }, + { + "epoch": 0.5440368987480781, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.470663547515869, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.706322431564331, + "num_tokens": 128278625.0, + "step": 4954 + }, + { + "epoch": 0.5441467164506919, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.4834320545196533, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7157880663871765, + "num_tokens": 128299970.0, + "step": 4955 + }, + { + "epoch": 0.5442565341533055, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.305479049682617, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7099587321281433, + "num_tokens": 128325057.0, + "step": 4956 + }, + { + "epoch": 0.5443663518559192, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.5145254135131836, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.698931097984314, + "num_tokens": 
128348579.0, + "step": 4957 + }, + { + "epoch": 0.5444761695585328, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.323106527328491, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7104399800300598, + "num_tokens": 128374047.0, + "step": 4958 + }, + { + "epoch": 0.5445859872611465, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.7478578090667725, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7040203213691711, + "num_tokens": 128394609.0, + "step": 4959 + }, + { + "epoch": 0.5446958049637601, + "ewc_loss": 1.1742115020751953e-05, + "grad_norm": 2.335176706314087, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6983067989349365, + "num_tokens": 128419846.0, + "step": 4960 + }, + { + "epoch": 0.5448056226663738, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.996281623840332, + "learning_rate": 1e-06, + "loss": 1.1127, + "mean_token_accuracy": 0.6753495931625366, + "num_tokens": 128453276.0, + "step": 4961 + }, + { + "epoch": 0.5449154403689875, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.3944547176361084, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.696299135684967, + "num_tokens": 128476588.0, + "step": 4962 + }, + { + "epoch": 0.5450252580716012, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.2686851024627686, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6993077397346497, + "num_tokens": 128503964.0, + "step": 4963 + }, + { + "epoch": 0.5451350757742148, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.20147967338562, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7147546410560608, + "num_tokens": 128531796.0, + "step": 4964 + }, + { + "epoch": 0.5452448934768285, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.408402681350708, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7071448564529419, + "num_tokens": 128554987.0, + "step": 4965 + }, + { + 
"epoch": 0.5453547111794421, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.486560821533203, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6866248846054077, + "num_tokens": 128580111.0, + "step": 4966 + }, + { + "epoch": 0.5454645288820558, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.1024436950683594, + "learning_rate": 1e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.6751052141189575, + "num_tokens": 128612332.0, + "step": 4967 + }, + { + "epoch": 0.5455743465846694, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.5536792278289795, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6857871413230896, + "num_tokens": 128636853.0, + "step": 4968 + }, + { + "epoch": 0.5456841642872831, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.278024673461914, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.72076815366745, + "num_tokens": 128664436.0, + "step": 4969 + }, + { + "epoch": 0.5457939819898968, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.2561330795288086, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6837791800498962, + "num_tokens": 128694240.0, + "step": 4970 + }, + { + "epoch": 0.5459037996925105, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.4114620685577393, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6951718330383301, + "num_tokens": 128719182.0, + "step": 4971 + }, + { + "epoch": 0.5460136173951241, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.254042863845825, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6997359991073608, + "num_tokens": 128746729.0, + "step": 4972 + }, + { + "epoch": 0.5461234350977378, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.429839849472046, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6923928260803223, + "num_tokens": 128772278.0, + "step": 4973 + }, + { + "epoch": 0.5462332528003514, + 
"ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.268381357192993, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6879535913467407, + "num_tokens": 128799048.0, + "step": 4974 + }, + { + "epoch": 0.546343070502965, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.244328498840332, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7011401653289795, + "num_tokens": 128825845.0, + "step": 4975 + }, + { + "epoch": 0.5464528882055787, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.328843593597412, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7039501667022705, + "num_tokens": 128854929.0, + "step": 4976 + }, + { + "epoch": 0.5465627059081924, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.5126729011535645, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7038208842277527, + "num_tokens": 128878255.0, + "step": 4977 + }, + { + "epoch": 0.5466725236108061, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.099499225616455, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6874946355819702, + "num_tokens": 128910671.0, + "step": 4978 + }, + { + "epoch": 0.5467823413134197, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.1607284545898438, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6897619962692261, + "num_tokens": 128941001.0, + "step": 4979 + }, + { + "epoch": 0.5468921590160334, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.3151559829711914, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7162912487983704, + "num_tokens": 128965677.0, + "step": 4980 + }, + { + "epoch": 0.547001976718647, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.3738231658935547, + "learning_rate": 1e-06, + "loss": 1.1039, + "mean_token_accuracy": 0.6772340536117554, + "num_tokens": 128992259.0, + "step": 4981 + }, + { + "epoch": 0.5471117944212607, + "ewc_loss": 1.1801719665527344e-05, + 
"grad_norm": 1.9857369661331177, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7129107117652893, + "num_tokens": 129026500.0, + "step": 4982 + }, + { + "epoch": 0.5472216121238743, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.269347906112671, + "learning_rate": 1e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.6759475469589233, + "num_tokens": 129055142.0, + "step": 4983 + }, + { + "epoch": 0.5473314298264881, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.067925453186035, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7032356858253479, + "num_tokens": 129087650.0, + "step": 4984 + }, + { + "epoch": 0.5474412475291017, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.057753324508667, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6979243755340576, + "num_tokens": 129117894.0, + "step": 4985 + }, + { + "epoch": 0.5475510652317154, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.287426233291626, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6948286294937134, + "num_tokens": 129144524.0, + "step": 4986 + }, + { + "epoch": 0.547660882934329, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.5546388626098633, + "learning_rate": 1e-06, + "loss": 1.078, + "mean_token_accuracy": 0.6889733672142029, + "num_tokens": 129167161.0, + "step": 4987 + }, + { + "epoch": 0.5477707006369427, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.447739839553833, + "learning_rate": 1e-06, + "loss": 1.1254, + "mean_token_accuracy": 0.6769680976867676, + "num_tokens": 129192187.0, + "step": 4988 + }, + { + "epoch": 0.5478805183395563, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.308934211730957, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6983553171157837, + "num_tokens": 129220666.0, + "step": 4989 + }, + { + "epoch": 0.54799033604217, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.475318193435669, + 
"learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7069052457809448, + "num_tokens": 129242823.0, + "step": 4990 + }, + { + "epoch": 0.5481001537447837, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.440579891204834, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6879386305809021, + "num_tokens": 129267928.0, + "step": 4991 + }, + { + "epoch": 0.5482099714473974, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.5158333778381348, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6866905093193054, + "num_tokens": 129292838.0, + "step": 4992 + }, + { + "epoch": 0.548319789150011, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.496854066848755, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7081325650215149, + "num_tokens": 129314953.0, + "step": 4993 + }, + { + "epoch": 0.5484296068526247, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.3729166984558105, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7176135778427124, + "num_tokens": 129339660.0, + "step": 4994 + }, + { + "epoch": 0.5485394245552383, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.4561407566070557, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7126483917236328, + "num_tokens": 129363111.0, + "step": 4995 + }, + { + "epoch": 0.5486492422578519, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.375086784362793, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6842494010925293, + "num_tokens": 129390318.0, + "step": 4996 + }, + { + "epoch": 0.5487590599604656, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.3392443656921387, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.691714346408844, + "num_tokens": 129416078.0, + "step": 4997 + }, + { + "epoch": 0.5488688776630792, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.718740463256836, + "learning_rate": 1e-06, + "loss": 
1.047, + "mean_token_accuracy": 0.6873447299003601, + "num_tokens": 129437106.0, + "step": 4998 + }, + { + "epoch": 0.548978695365693, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.1010189056396484, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7017080187797546, + "num_tokens": 129469652.0, + "step": 4999 + }, + { + "epoch": 0.5490885130683066, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.258699893951416, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7079362869262695, + "num_tokens": 129496588.0, + "step": 5000 + }, + { + "epoch": 0.5491983307709203, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.3623313903808594, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7040570974349976, + "num_tokens": 129521502.0, + "step": 5001 + }, + { + "epoch": 0.5493081484735339, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.45453143119812, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7178242802619934, + "num_tokens": 129544748.0, + "step": 5002 + }, + { + "epoch": 0.5494179661761476, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 2.1822621822357178, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.699774444103241, + "num_tokens": 129574117.0, + "step": 5003 + }, + { + "epoch": 0.5495277838787612, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 3.011838674545288, + "learning_rate": 1e-06, + "loss": 1.1416, + "mean_token_accuracy": 0.6836884021759033, + "num_tokens": 129595211.0, + "step": 5004 + }, + { + "epoch": 0.5496376015813749, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.5270578861236572, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6975813508033752, + "num_tokens": 129619967.0, + "step": 5005 + }, + { + "epoch": 0.5497474192839886, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.3265154361724854, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 
0.709470272064209, + "num_tokens": 129645294.0, + "step": 5006 + }, + { + "epoch": 0.5498572369866023, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.6248743534088135, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7019566297531128, + "num_tokens": 129664580.0, + "step": 5007 + }, + { + "epoch": 0.5499670546892159, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.304664373397827, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.691274106502533, + "num_tokens": 129691874.0, + "step": 5008 + }, + { + "epoch": 0.5500768723918296, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.305058240890503, + "learning_rate": 1e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6846789121627808, + "num_tokens": 129719829.0, + "step": 5009 + }, + { + "epoch": 0.5501866900944432, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.4053757190704346, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7261322736740112, + "num_tokens": 129743967.0, + "step": 5010 + }, + { + "epoch": 0.5502965077970569, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.4495975971221924, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7163076400756836, + "num_tokens": 129767756.0, + "step": 5011 + }, + { + "epoch": 0.5504063254996705, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.40969181060791, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7058577537536621, + "num_tokens": 129790423.0, + "step": 5012 + }, + { + "epoch": 0.5505161432022843, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.4600205421447754, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6917873620986938, + "num_tokens": 129815734.0, + "step": 5013 + }, + { + "epoch": 0.5506259609048979, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.2532248497009277, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7106776237487793, + "num_tokens": 
129842341.0, + "step": 5014 + }, + { + "epoch": 0.5507357786075116, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.0909974575042725, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6999065279960632, + "num_tokens": 129873370.0, + "step": 5015 + }, + { + "epoch": 0.5508455963101252, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.4554946422576904, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6993836164474487, + "num_tokens": 129897220.0, + "step": 5016 + }, + { + "epoch": 0.5509554140127388, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.3121161460876465, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6981292963027954, + "num_tokens": 129922265.0, + "step": 5017 + }, + { + "epoch": 0.5510652317153525, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.500847578048706, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7010036706924438, + "num_tokens": 129946103.0, + "step": 5018 + }, + { + "epoch": 0.5511750494179661, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.278655529022217, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6967178583145142, + "num_tokens": 129973655.0, + "step": 5019 + }, + { + "epoch": 0.5512848671205799, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.3650765419006348, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7223891615867615, + "num_tokens": 129998339.0, + "step": 5020 + }, + { + "epoch": 0.5513946848231935, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.1525213718414307, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.68880295753479, + "num_tokens": 130031600.0, + "step": 5021 + }, + { + "epoch": 0.5515045025258072, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.398530960083008, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.694150447845459, + "num_tokens": 130055600.0, + "step": 5022 + }, + { + 
"epoch": 0.5516143202284208, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.5247421264648438, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7154478430747986, + "num_tokens": 130077860.0, + "step": 5023 + }, + { + "epoch": 0.5517241379310345, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.629298686981201, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7106151580810547, + "num_tokens": 130102686.0, + "step": 5024 + }, + { + "epoch": 0.5518339556336481, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.5476503372192383, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6893383860588074, + "num_tokens": 130125007.0, + "step": 5025 + }, + { + "epoch": 0.5519437733362618, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.122903823852539, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.7001802325248718, + "num_tokens": 130154385.0, + "step": 5026 + }, + { + "epoch": 0.5520535910388754, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.374422788619995, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7147221565246582, + "num_tokens": 130179220.0, + "step": 5027 + }, + { + "epoch": 0.5521634087414892, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.4074366092681885, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7008256912231445, + "num_tokens": 130204245.0, + "step": 5028 + }, + { + "epoch": 0.5522732264441028, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.047743082046509, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7182093262672424, + "num_tokens": 130237468.0, + "step": 5029 + }, + { + "epoch": 0.5523830441467165, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.4356698989868164, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6944847106933594, + "num_tokens": 130261337.0, + "step": 5030 + }, + { + "epoch": 0.5524928618493301, + 
"ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.2004687786102295, + "learning_rate": 1e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6879994869232178, + "num_tokens": 130292001.0, + "step": 5031 + }, + { + "epoch": 0.5526026795519438, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.561115026473999, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.696750283241272, + "num_tokens": 130315471.0, + "step": 5032 + }, + { + "epoch": 0.5527124972545574, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.4117188453674316, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6938831806182861, + "num_tokens": 130343018.0, + "step": 5033 + }, + { + "epoch": 0.552822314957171, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.359358072280884, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6917299032211304, + "num_tokens": 130368875.0, + "step": 5034 + }, + { + "epoch": 0.5529321326597848, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.238368272781372, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6976803541183472, + "num_tokens": 130396092.0, + "step": 5035 + }, + { + "epoch": 0.5530419503623984, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.448936700820923, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6915080547332764, + "num_tokens": 130420468.0, + "step": 5036 + }, + { + "epoch": 0.5531517680650121, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.742448091506958, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7029629945755005, + "num_tokens": 130440923.0, + "step": 5037 + }, + { + "epoch": 0.5532615857676257, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.32746958732605, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7279605865478516, + "num_tokens": 130465065.0, + "step": 5038 + }, + { + "epoch": 0.5533714034702394, + "ewc_loss": 1.1861324310302734e-05, + 
"grad_norm": 2.613558292388916, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6841703057289124, + "num_tokens": 130487004.0, + "step": 5039 + }, + { + "epoch": 0.553481221172853, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.3714137077331543, + "learning_rate": 1e-06, + "loss": 1.0879, + "mean_token_accuracy": 0.6750665903091431, + "num_tokens": 130514627.0, + "step": 5040 + }, + { + "epoch": 0.5535910388754667, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.5674619674682617, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6836851835250854, + "num_tokens": 130536856.0, + "step": 5041 + }, + { + "epoch": 0.5537008565780804, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.5120763778686523, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7049939632415771, + "num_tokens": 130562912.0, + "step": 5042 + }, + { + "epoch": 0.5538106742806941, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.264185667037964, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7046322226524353, + "num_tokens": 130589178.0, + "step": 5043 + }, + { + "epoch": 0.5539204919833077, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.3455471992492676, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7004885673522949, + "num_tokens": 130617557.0, + "step": 5044 + }, + { + "epoch": 0.5540303096859214, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.435436248779297, + "learning_rate": 1e-06, + "loss": 1.0858, + "mean_token_accuracy": 0.6811069846153259, + "num_tokens": 130645531.0, + "step": 5045 + }, + { + "epoch": 0.554140127388535, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.7198500633239746, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7022167444229126, + "num_tokens": 130666322.0, + "step": 5046 + }, + { + "epoch": 0.5542499450911487, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.5029900074005127, + 
"learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7134746313095093, + "num_tokens": 130689313.0, + "step": 5047 + }, + { + "epoch": 0.5543597627937623, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.2450358867645264, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7053066492080688, + "num_tokens": 130717803.0, + "step": 5048 + }, + { + "epoch": 0.5544695804963761, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.111464738845825, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6901241540908813, + "num_tokens": 130747161.0, + "step": 5049 + }, + { + "epoch": 0.5545793981989897, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.582090139389038, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6973641514778137, + "num_tokens": 130768925.0, + "step": 5050 + }, + { + "epoch": 0.5546892159016034, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.4229073524475098, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7026426792144775, + "num_tokens": 130792964.0, + "step": 5051 + }, + { + "epoch": 0.554799033604217, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.6821677684783936, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7150534391403198, + "num_tokens": 130813028.0, + "step": 5052 + }, + { + "epoch": 0.5549088513068307, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.32079815864563, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6995230913162231, + "num_tokens": 130838883.0, + "step": 5053 + }, + { + "epoch": 0.5550186690094443, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.2781877517700195, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6958545446395874, + "num_tokens": 130866937.0, + "step": 5054 + }, + { + "epoch": 0.555128486712058, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.3655803203582764, + "learning_rate": 1e-06, + "loss": 
1.0376, + "mean_token_accuracy": 0.6875556111335754, + "num_tokens": 130891268.0, + "step": 5055 + }, + { + "epoch": 0.5552383044146716, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.2759087085723877, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6956968307495117, + "num_tokens": 130920607.0, + "step": 5056 + }, + { + "epoch": 0.5553481221172853, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.3598785400390625, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.7011301517486572, + "num_tokens": 130947053.0, + "step": 5057 + }, + { + "epoch": 0.555457939819899, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.701169490814209, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6926769614219666, + "num_tokens": 130968392.0, + "step": 5058 + }, + { + "epoch": 0.5555677575225126, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.54073429107666, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7263758182525635, + "num_tokens": 130990351.0, + "step": 5059 + }, + { + "epoch": 0.5556775752251263, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3603127002716064, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7068018317222595, + "num_tokens": 131016161.0, + "step": 5060 + }, + { + "epoch": 0.5557873929277399, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 2.6136474609375, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7075709104537964, + "num_tokens": 131037117.0, + "step": 5061 + }, + { + "epoch": 0.5558972106303536, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.526639223098755, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6955267786979675, + "num_tokens": 131060013.0, + "step": 5062 + }, + { + "epoch": 0.5560070283329672, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.507915496826172, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 
0.6927310824394226, + "num_tokens": 131084337.0, + "step": 5063 + }, + { + "epoch": 0.556116846035581, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.2799220085144043, + "learning_rate": 1e-06, + "loss": 1.1408, + "mean_token_accuracy": 0.667794942855835, + "num_tokens": 131116657.0, + "step": 5064 + }, + { + "epoch": 0.5562266637381946, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.464669704437256, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.703624963760376, + "num_tokens": 131140951.0, + "step": 5065 + }, + { + "epoch": 0.5563364814408083, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.347945213317871, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7007306814193726, + "num_tokens": 131166105.0, + "step": 5066 + }, + { + "epoch": 0.5564462991434219, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.210618257522583, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6911876201629639, + "num_tokens": 131197081.0, + "step": 5067 + }, + { + "epoch": 0.5565561168460356, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.6414520740509033, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7232846021652222, + "num_tokens": 131218222.0, + "step": 5068 + }, + { + "epoch": 0.5566659345486492, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.367757797241211, + "learning_rate": 1e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.692791223526001, + "num_tokens": 131242354.0, + "step": 5069 + }, + { + "epoch": 0.5567757522512629, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.34325909614563, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.705426812171936, + "num_tokens": 131267390.0, + "step": 5070 + }, + { + "epoch": 0.5568855699538766, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3128252029418945, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7003678679466248, + "num_tokens": 
131295486.0, + "step": 5071 + }, + { + "epoch": 0.5569953876564903, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.5429694652557373, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7040058374404907, + "num_tokens": 131317985.0, + "step": 5072 + }, + { + "epoch": 0.5571052053591039, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.8695223331451416, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7183205485343933, + "num_tokens": 131336640.0, + "step": 5073 + }, + { + "epoch": 0.5572150230617176, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.405384063720703, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6873295903205872, + "num_tokens": 131361817.0, + "step": 5074 + }, + { + "epoch": 0.5573248407643312, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.2660605907440186, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7263146638870239, + "num_tokens": 131388140.0, + "step": 5075 + }, + { + "epoch": 0.5574346584669448, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.430180311203003, + "learning_rate": 1e-06, + "loss": 1.1017, + "mean_token_accuracy": 0.681175947189331, + "num_tokens": 131414579.0, + "step": 5076 + }, + { + "epoch": 0.5575444761695585, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.2109146118164062, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7156798839569092, + "num_tokens": 131445760.0, + "step": 5077 + }, + { + "epoch": 0.5576542938721722, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.2548739910125732, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6965596675872803, + "num_tokens": 131472746.0, + "step": 5078 + }, + { + "epoch": 0.5577641115747859, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.5184364318847656, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7121209502220154, + "num_tokens": 131494842.0, + "step": 5079 + }, + 
{ + "epoch": 0.5578739292773995, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.5958611965179443, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6998850107192993, + "num_tokens": 131517296.0, + "step": 5080 + }, + { + "epoch": 0.5579837469800132, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.1911165714263916, + "learning_rate": 1e-06, + "loss": 1.0883, + "mean_token_accuracy": 0.6790189146995544, + "num_tokens": 131546387.0, + "step": 5081 + }, + { + "epoch": 0.5580935646826268, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.814349889755249, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.709731936454773, + "num_tokens": 131564509.0, + "step": 5082 + }, + { + "epoch": 0.5582033823852405, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.4635722637176514, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6986147165298462, + "num_tokens": 131588213.0, + "step": 5083 + }, + { + "epoch": 0.5583132000878541, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.299968719482422, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6919516921043396, + "num_tokens": 131616361.0, + "step": 5084 + }, + { + "epoch": 0.5584230177904678, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.1980881690979004, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.6864969730377197, + "num_tokens": 131646562.0, + "step": 5085 + }, + { + "epoch": 0.5585328354930815, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3233020305633545, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7083249688148499, + "num_tokens": 131670099.0, + "step": 5086 + }, + { + "epoch": 0.5586426531956952, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3056480884552, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6896508932113647, + "num_tokens": 131697822.0, + "step": 5087 + }, + { + "epoch": 0.5587524708983088, + 
"ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.4277572631835938, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7096784710884094, + "num_tokens": 131721692.0, + "step": 5088 + }, + { + "epoch": 0.5588622886009225, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.72454833984375, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7023006677627563, + "num_tokens": 131743548.0, + "step": 5089 + }, + { + "epoch": 0.5589721063035361, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.390854597091675, + "learning_rate": 1e-06, + "loss": 1.1698, + "mean_token_accuracy": 0.6662957668304443, + "num_tokens": 131769790.0, + "step": 5090 + }, + { + "epoch": 0.5590819240061498, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.1773524284362793, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6930615901947021, + "num_tokens": 131799241.0, + "step": 5091 + }, + { + "epoch": 0.5591917417087634, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 1.949126958847046, + "learning_rate": 1e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.6851300597190857, + "num_tokens": 131839024.0, + "step": 5092 + }, + { + "epoch": 0.5593015594113772, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3166940212249756, + "learning_rate": 1e-06, + "loss": 1.0759, + "mean_token_accuracy": 0.6822146773338318, + "num_tokens": 131866780.0, + "step": 5093 + }, + { + "epoch": 0.5594113771139908, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.237579822540283, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.695130467414856, + "num_tokens": 131894181.0, + "step": 5094 + }, + { + "epoch": 0.5595211948166045, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.852226734161377, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6924892067909241, + "num_tokens": 131914061.0, + "step": 5095 + }, + { + "epoch": 0.5596310125192181, + "ewc_loss": 1.1980533599853516e-05, + 
"grad_norm": 2.2380611896514893, + "learning_rate": 1e-06, + "loss": 1.1275, + "mean_token_accuracy": 0.6727082133293152, + "num_tokens": 131944439.0, + "step": 5096 + }, + { + "epoch": 0.5597408302218317, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.2509231567382812, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7214566469192505, + "num_tokens": 131969212.0, + "step": 5097 + }, + { + "epoch": 0.5598506479244454, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3485195636749268, + "learning_rate": 1e-06, + "loss": 1.1776, + "mean_token_accuracy": 0.6570942997932434, + "num_tokens": 131998108.0, + "step": 5098 + }, + { + "epoch": 0.559960465627059, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.0803494453430176, + "learning_rate": 1e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6756317615509033, + "num_tokens": 132031655.0, + "step": 5099 + }, + { + "epoch": 0.5600702833296728, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.6163110733032227, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.710962176322937, + "num_tokens": 132051282.0, + "step": 5100 + }, + { + "epoch": 0.5601801010322864, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3206255435943604, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6945167779922485, + "num_tokens": 132075903.0, + "step": 5101 + }, + { + "epoch": 0.5602899187349001, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.36006236076355, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.696513295173645, + "num_tokens": 132101575.0, + "step": 5102 + }, + { + "epoch": 0.5603997364375137, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3156306743621826, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6987067461013794, + "num_tokens": 132127711.0, + "step": 5103 + }, + { + "epoch": 0.5605095541401274, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.5064175128936768, + 
"learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7147660851478577, + "num_tokens": 132150607.0, + "step": 5104 + }, + { + "epoch": 0.560619371842741, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.5121233463287354, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6993295550346375, + "num_tokens": 132174942.0, + "step": 5105 + }, + { + "epoch": 0.5607291895453547, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.4720375537872314, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7167530059814453, + "num_tokens": 132197226.0, + "step": 5106 + }, + { + "epoch": 0.5608390072479684, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.2842414379119873, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7139345407485962, + "num_tokens": 132224700.0, + "step": 5107 + }, + { + "epoch": 0.5609488249505821, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.4080424308776855, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6888754963874817, + "num_tokens": 132249511.0, + "step": 5108 + }, + { + "epoch": 0.5610586426531957, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3117241859436035, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6850684881210327, + "num_tokens": 132276127.0, + "step": 5109 + }, + { + "epoch": 0.5611684603558094, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.390284299850464, + "learning_rate": 1e-06, + "loss": 1.1046, + "mean_token_accuracy": 0.6754822731018066, + "num_tokens": 132306672.0, + "step": 5110 + }, + { + "epoch": 0.561278278058423, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.2841098308563232, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.7019073963165283, + "num_tokens": 132333910.0, + "step": 5111 + }, + { + "epoch": 0.5613880957610367, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3325979709625244, + "learning_rate": 1e-06, + "loss": 
1.1033, + "mean_token_accuracy": 0.6833306550979614, + "num_tokens": 132363479.0, + "step": 5112 + }, + { + "epoch": 0.5614979134636503, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.3567423820495605, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7029687166213989, + "num_tokens": 132390880.0, + "step": 5113 + }, + { + "epoch": 0.5616077311662641, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.364131450653076, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7009304761886597, + "num_tokens": 132416496.0, + "step": 5114 + }, + { + "epoch": 0.5617175488688777, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 2.314114809036255, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7079200148582458, + "num_tokens": 132441545.0, + "step": 5115 + }, + { + "epoch": 0.5618273665714913, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.2850899696350098, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7182519435882568, + "num_tokens": 132466047.0, + "step": 5116 + }, + { + "epoch": 0.561937184274105, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.3702690601348877, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7196025848388672, + "num_tokens": 132487881.0, + "step": 5117 + }, + { + "epoch": 0.5620470019767186, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.4340877532958984, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7168471813201904, + "num_tokens": 132515071.0, + "step": 5118 + }, + { + "epoch": 0.5621568196793323, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.473576784133911, + "learning_rate": 1e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.6810351610183716, + "num_tokens": 132539234.0, + "step": 5119 + }, + { + "epoch": 0.5622666373819459, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.214284658432007, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 
0.6864970326423645, + "num_tokens": 132569368.0, + "step": 5120 + }, + { + "epoch": 0.5623764550845596, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.280888795852661, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.700141191482544, + "num_tokens": 132595696.0, + "step": 5121 + }, + { + "epoch": 0.5624862727871733, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.236987590789795, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6907855272293091, + "num_tokens": 132627634.0, + "step": 5122 + }, + { + "epoch": 0.562596090489787, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.4505579471588135, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6954889893531799, + "num_tokens": 132652802.0, + "step": 5123 + }, + { + "epoch": 0.5627059081924006, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.403024911880493, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7241818308830261, + "num_tokens": 132677330.0, + "step": 5124 + }, + { + "epoch": 0.5628157258950143, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.272951126098633, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7015508413314819, + "num_tokens": 132703704.0, + "step": 5125 + }, + { + "epoch": 0.5629255435976279, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.3374247550964355, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7101263999938965, + "num_tokens": 132731686.0, + "step": 5126 + }, + { + "epoch": 0.5630353613002416, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.181236743927002, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7111326456069946, + "num_tokens": 132761771.0, + "step": 5127 + }, + { + "epoch": 0.5631451790028552, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.375448226928711, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6889775991439819, + "num_tokens": 
132786439.0, + "step": 5128 + }, + { + "epoch": 0.563254996705469, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 4.466264724731445, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7224023938179016, + "num_tokens": 132811005.0, + "step": 5129 + }, + { + "epoch": 0.5633648144080826, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.4020702838897705, + "learning_rate": 1e-06, + "loss": 1.1443, + "mean_token_accuracy": 0.668530285358429, + "num_tokens": 132839399.0, + "step": 5130 + }, + { + "epoch": 0.5634746321106963, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.1705803871154785, + "learning_rate": 1e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6857339143753052, + "num_tokens": 132867618.0, + "step": 5131 + }, + { + "epoch": 0.5635844498133099, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.007068157196045, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6865555047988892, + "num_tokens": 132901567.0, + "step": 5132 + }, + { + "epoch": 0.5636942675159236, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.4110190868377686, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7150009870529175, + "num_tokens": 132925044.0, + "step": 5133 + }, + { + "epoch": 0.5638040852185372, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.6980199813842773, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7202141284942627, + "num_tokens": 132944209.0, + "step": 5134 + }, + { + "epoch": 0.5639139029211508, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.1957311630249023, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6997436285018921, + "num_tokens": 132973354.0, + "step": 5135 + }, + { + "epoch": 0.5640237206237646, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.6478631496429443, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7304574847221375, + "num_tokens": 132992898.0, + "step": 5136 + }, + 
{ + "epoch": 0.5641335383263782, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.6195201873779297, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6978962421417236, + "num_tokens": 133013935.0, + "step": 5137 + }, + { + "epoch": 0.5642433560289919, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.7483599185943604, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7225655317306519, + "num_tokens": 133033719.0, + "step": 5138 + }, + { + "epoch": 0.5643531737316055, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.195808172225952, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.696161687374115, + "num_tokens": 133062200.0, + "step": 5139 + }, + { + "epoch": 0.5644629914342192, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.308997392654419, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6949406266212463, + "num_tokens": 133090360.0, + "step": 5140 + }, + { + "epoch": 0.5645728091368328, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.373547077178955, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6930510401725769, + "num_tokens": 133117850.0, + "step": 5141 + }, + { + "epoch": 0.5646826268394465, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.443898916244507, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7123982906341553, + "num_tokens": 133142324.0, + "step": 5142 + }, + { + "epoch": 0.5647924445420602, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.573324680328369, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7386947870254517, + "num_tokens": 133163820.0, + "step": 5143 + }, + { + "epoch": 0.5649022622446739, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.306529998779297, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7116575241088867, + "num_tokens": 133190171.0, + "step": 5144 + }, + { + "epoch": 0.5650120799472875, + 
"ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.666959762573242, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7129648923873901, + "num_tokens": 133211488.0, + "step": 5145 + }, + { + "epoch": 0.5651218976499012, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.707085609436035, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7044289708137512, + "num_tokens": 133232643.0, + "step": 5146 + }, + { + "epoch": 0.5652317153525148, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.8125689029693604, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7084594964981079, + "num_tokens": 133250041.0, + "step": 5147 + }, + { + "epoch": 0.5653415330551285, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.3577053546905518, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.689347505569458, + "num_tokens": 133274638.0, + "step": 5148 + }, + { + "epoch": 0.5654513507577421, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.237691640853882, + "learning_rate": 1e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.6881569027900696, + "num_tokens": 133307670.0, + "step": 5149 + }, + { + "epoch": 0.5655611684603558, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.2885470390319824, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7013431787490845, + "num_tokens": 133332989.0, + "step": 5150 + }, + { + "epoch": 0.5656709861629695, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.1468193531036377, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7054980993270874, + "num_tokens": 133360939.0, + "step": 5151 + }, + { + "epoch": 0.5657808038655832, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.3642990589141846, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7083771228790283, + "num_tokens": 133388730.0, + "step": 5152 + }, + { + "epoch": 0.5658906215681968, + "ewc_loss": 1.1980533599853516e-05, 
+ "grad_norm": 2.2912633419036865, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6987585425376892, + "num_tokens": 133416971.0, + "step": 5153 + }, + { + "epoch": 0.5660004392708105, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.323134422302246, + "learning_rate": 1e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6865249872207642, + "num_tokens": 133444888.0, + "step": 5154 + }, + { + "epoch": 0.5661102569734241, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.306389093399048, + "learning_rate": 1e-06, + "loss": 1.1081, + "mean_token_accuracy": 0.6815137267112732, + "num_tokens": 133471556.0, + "step": 5155 + }, + { + "epoch": 0.5662200746760377, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.132610559463501, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6917165517807007, + "num_tokens": 133504244.0, + "step": 5156 + }, + { + "epoch": 0.5663298923786514, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.165109872817993, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6837494373321533, + "num_tokens": 133533015.0, + "step": 5157 + }, + { + "epoch": 0.5664397100812651, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.7595648765563965, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6909615993499756, + "num_tokens": 133553724.0, + "step": 5158 + }, + { + "epoch": 0.5665495277838788, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.0743649005889893, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.6837717294692993, + "num_tokens": 133586088.0, + "step": 5159 + }, + { + "epoch": 0.5666593454864924, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.4111618995666504, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6967124938964844, + "num_tokens": 133610842.0, + "step": 5160 + }, + { + "epoch": 0.5667691631891061, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.5602259635925293, + 
"learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6980451345443726, + "num_tokens": 133632718.0, + "step": 5161 + }, + { + "epoch": 0.5668789808917197, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.2645931243896484, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6927564740180969, + "num_tokens": 133657697.0, + "step": 5162 + }, + { + "epoch": 0.5669887985943334, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.386037826538086, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.700586199760437, + "num_tokens": 133680641.0, + "step": 5163 + }, + { + "epoch": 0.567098616296947, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.22133207321167, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7209886312484741, + "num_tokens": 133707898.0, + "step": 5164 + }, + { + "epoch": 0.5672084339995608, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.7329013347625732, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.7010742425918579, + "num_tokens": 133729956.0, + "step": 5165 + }, + { + "epoch": 0.5673182517021744, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1680243015289307, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7300860285758972, + "num_tokens": 133756836.0, + "step": 5166 + }, + { + "epoch": 0.5674280694047881, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.471128463745117, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7150111198425293, + "num_tokens": 133778886.0, + "step": 5167 + }, + { + "epoch": 0.5675378871074017, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.326914072036743, + "learning_rate": 1e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.6822194457054138, + "num_tokens": 133809059.0, + "step": 5168 + }, + { + "epoch": 0.5676477048100154, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1268653869628906, + "learning_rate": 1e-06, + "loss": 
0.9871, + "mean_token_accuracy": 0.703113317489624, + "num_tokens": 133839123.0, + "step": 5169 + }, + { + "epoch": 0.567757522512629, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.150601625442505, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7324597835540771, + "num_tokens": 133866533.0, + "step": 5170 + }, + { + "epoch": 0.5678673402152427, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1959755420684814, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6864007711410522, + "num_tokens": 133897919.0, + "step": 5171 + }, + { + "epoch": 0.5679771579178564, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1224365234375, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6952279806137085, + "num_tokens": 133928244.0, + "step": 5172 + }, + { + "epoch": 0.5680869756204701, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.323322057723999, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6890337467193604, + "num_tokens": 133956029.0, + "step": 5173 + }, + { + "epoch": 0.5681967933230837, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.374894857406616, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6804504990577698, + "num_tokens": 133986082.0, + "step": 5174 + }, + { + "epoch": 0.5683066110256974, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4278488159179688, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6955375671386719, + "num_tokens": 134011899.0, + "step": 5175 + }, + { + "epoch": 0.568416428728311, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 2.6215288639068604, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7035346031188965, + "num_tokens": 134032637.0, + "step": 5176 + }, + { + "epoch": 0.5685262464309246, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3939836025238037, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 
0.7038744688034058, + "num_tokens": 134056987.0, + "step": 5177 + }, + { + "epoch": 0.5686360641335383, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.523960590362549, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6882110834121704, + "num_tokens": 134081097.0, + "step": 5178 + }, + { + "epoch": 0.5687458818361519, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.5559439659118652, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7067008018493652, + "num_tokens": 134104706.0, + "step": 5179 + }, + { + "epoch": 0.5688556995387657, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.352368116378784, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.6853480339050293, + "num_tokens": 134131050.0, + "step": 5180 + }, + { + "epoch": 0.5689655172413793, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.417559862136841, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6907598972320557, + "num_tokens": 134157262.0, + "step": 5181 + }, + { + "epoch": 0.569075334943993, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.192295789718628, + "learning_rate": 1e-06, + "loss": 1.1675, + "mean_token_accuracy": 0.6667364239692688, + "num_tokens": 134188743.0, + "step": 5182 + }, + { + "epoch": 0.5691851526466066, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3135826587677, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.7021963596343994, + "num_tokens": 134215418.0, + "step": 5183 + }, + { + "epoch": 0.5692949703492203, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.9825125932693481, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7071383595466614, + "num_tokens": 134248000.0, + "step": 5184 + }, + { + "epoch": 0.5694047880518339, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4323513507843018, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7086659669876099, + "num_tokens": 
134271086.0, + "step": 5185 + }, + { + "epoch": 0.5695146057544476, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.463512659072876, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7130448818206787, + "num_tokens": 134293514.0, + "step": 5186 + }, + { + "epoch": 0.5696244234570613, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2734556198120117, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6836845278739929, + "num_tokens": 134324497.0, + "step": 5187 + }, + { + "epoch": 0.569734241159675, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.376866579055786, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7071309089660645, + "num_tokens": 134348726.0, + "step": 5188 + }, + { + "epoch": 0.5698440588622886, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.402256965637207, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.71323561668396, + "num_tokens": 134375115.0, + "step": 5189 + }, + { + "epoch": 0.5699538765649023, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.579864025115967, + "learning_rate": 1e-06, + "loss": 1.1041, + "mean_token_accuracy": 0.6814343929290771, + "num_tokens": 134397922.0, + "step": 5190 + }, + { + "epoch": 0.5700636942675159, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2516918182373047, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.71825110912323, + "num_tokens": 134425011.0, + "step": 5191 + }, + { + "epoch": 0.5701735119701296, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.660200595855713, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6922768354415894, + "num_tokens": 134452007.0, + "step": 5192 + }, + { + "epoch": 0.5702833296727432, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.557668924331665, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6968700885772705, + "num_tokens": 134480460.0, + "step": 5193 + }, + { + 
"epoch": 0.570393147375357, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.294949769973755, + "learning_rate": 1e-06, + "loss": 1.1234, + "mean_token_accuracy": 0.6746103763580322, + "num_tokens": 134507478.0, + "step": 5194 + }, + { + "epoch": 0.5705029650779706, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.536299228668213, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6872604489326477, + "num_tokens": 134531741.0, + "step": 5195 + }, + { + "epoch": 0.5706127827805842, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.355207681655884, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.705263078212738, + "num_tokens": 134557636.0, + "step": 5196 + }, + { + "epoch": 0.5707226004831979, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4521610736846924, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7001626491546631, + "num_tokens": 134579402.0, + "step": 5197 + }, + { + "epoch": 0.5708324181858115, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4306561946868896, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7015594244003296, + "num_tokens": 134606547.0, + "step": 5198 + }, + { + "epoch": 0.5709422358884252, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.441939115524292, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7162879705429077, + "num_tokens": 134630304.0, + "step": 5199 + }, + { + "epoch": 0.5710520535910388, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.432816982269287, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6991386413574219, + "num_tokens": 134654483.0, + "step": 5200 + }, + { + "epoch": 0.5711618712936526, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1469271183013916, + "learning_rate": 1e-06, + "loss": 1.1013, + "mean_token_accuracy": 0.6785012483596802, + "num_tokens": 134687064.0, + "step": 5201 + }, + { + "epoch": 0.5712716889962662, + 
"ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.299004316329956, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7071820497512817, + "num_tokens": 134713343.0, + "step": 5202 + }, + { + "epoch": 0.5713815066988799, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.544523000717163, + "learning_rate": 1e-06, + "loss": 1.0796, + "mean_token_accuracy": 0.6839522123336792, + "num_tokens": 134736151.0, + "step": 5203 + }, + { + "epoch": 0.5714913244014935, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 3.8867814540863037, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.717963695526123, + "num_tokens": 134760650.0, + "step": 5204 + }, + { + "epoch": 0.5716011421041072, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.7310588359832764, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7062005400657654, + "num_tokens": 134780341.0, + "step": 5205 + }, + { + "epoch": 0.5717109598067208, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4239418506622314, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6995933055877686, + "num_tokens": 134804164.0, + "step": 5206 + }, + { + "epoch": 0.5718207775093345, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.272284984588623, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.70068359375, + "num_tokens": 134830883.0, + "step": 5207 + }, + { + "epoch": 0.5719305952119481, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3750884532928467, + "learning_rate": 1e-06, + "loss": 1.0937, + "mean_token_accuracy": 0.6820988655090332, + "num_tokens": 134857646.0, + "step": 5208 + }, + { + "epoch": 0.5720404129145619, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1890671253204346, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7041918039321899, + "num_tokens": 134886570.0, + "step": 5209 + }, + { + "epoch": 0.5721502306171755, + "ewc_loss": 1.2040138244628906e-05, + 
"grad_norm": 3.806391477584839, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7182559967041016, + "num_tokens": 134909165.0, + "step": 5210 + }, + { + "epoch": 0.5722600483197892, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.364229679107666, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.7008889317512512, + "num_tokens": 134935723.0, + "step": 5211 + }, + { + "epoch": 0.5723698660224028, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.8205487728118896, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7082422971725464, + "num_tokens": 134952760.0, + "step": 5212 + }, + { + "epoch": 0.5724796837250165, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.323294162750244, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7028443217277527, + "num_tokens": 134978712.0, + "step": 5213 + }, + { + "epoch": 0.5725895014276301, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2600951194763184, + "learning_rate": 1e-06, + "loss": 1.0883, + "mean_token_accuracy": 0.6804255843162537, + "num_tokens": 135007948.0, + "step": 5214 + }, + { + "epoch": 0.5726993191302437, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.073185443878174, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6865630149841309, + "num_tokens": 135039118.0, + "step": 5215 + }, + { + "epoch": 0.5728091368328575, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.9660557508468628, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7012129426002502, + "num_tokens": 135076922.0, + "step": 5216 + }, + { + "epoch": 0.5729189545354711, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1632161140441895, + "learning_rate": 1e-06, + "loss": 1.0974, + "mean_token_accuracy": 0.6842486262321472, + "num_tokens": 135106676.0, + "step": 5217 + }, + { + "epoch": 0.5730287722380848, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4658210277557373, + 
"learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.714316725730896, + "num_tokens": 135127502.0, + "step": 5218 + }, + { + "epoch": 0.5731385899406984, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.37550950050354, + "learning_rate": 1e-06, + "loss": 1.0957, + "mean_token_accuracy": 0.6791247129440308, + "num_tokens": 135152895.0, + "step": 5219 + }, + { + "epoch": 0.5732484076433121, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.089094400405884, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7054354548454285, + "num_tokens": 135185791.0, + "step": 5220 + }, + { + "epoch": 0.5733582253459257, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 3.7165462970733643, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6861549615859985, + "num_tokens": 135216237.0, + "step": 5221 + }, + { + "epoch": 0.5734680430485394, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3747105598449707, + "learning_rate": 1e-06, + "loss": 1.1045, + "mean_token_accuracy": 0.6795668005943298, + "num_tokens": 135241407.0, + "step": 5222 + }, + { + "epoch": 0.5735778607511531, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.5849156379699707, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7181702852249146, + "num_tokens": 135262741.0, + "step": 5223 + }, + { + "epoch": 0.5736876784537668, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.334152936935425, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7295159101486206, + "num_tokens": 135285964.0, + "step": 5224 + }, + { + "epoch": 0.5737974961563804, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.289818048477173, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.689116895198822, + "num_tokens": 135314741.0, + "step": 5225 + }, + { + "epoch": 0.5739073138589941, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.5056591033935547, + "learning_rate": 1e-06, + "loss": 
1.0356, + "mean_token_accuracy": 0.6992253065109253, + "num_tokens": 135336909.0, + "step": 5226 + }, + { + "epoch": 0.5740171315616077, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.17960524559021, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7241525650024414, + "num_tokens": 135364698.0, + "step": 5227 + }, + { + "epoch": 0.5741269492642214, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3380253314971924, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7014610767364502, + "num_tokens": 135389786.0, + "step": 5228 + }, + { + "epoch": 0.574236766966835, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3612704277038574, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7151792645454407, + "num_tokens": 135413067.0, + "step": 5229 + }, + { + "epoch": 0.5743465846694488, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.69343638420105, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7028213143348694, + "num_tokens": 135436186.0, + "step": 5230 + }, + { + "epoch": 0.5744564023720624, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.6111176013946533, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7284299731254578, + "num_tokens": 135457201.0, + "step": 5231 + }, + { + "epoch": 0.5745662200746761, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3996801376342773, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7080235481262207, + "num_tokens": 135481982.0, + "step": 5232 + }, + { + "epoch": 0.5746760377772897, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.602036714553833, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6884799003601074, + "num_tokens": 135505097.0, + "step": 5233 + }, + { + "epoch": 0.5747858554799034, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.226915121078491, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 
0.6926915645599365, + "num_tokens": 135533256.0, + "step": 5234 + }, + { + "epoch": 0.574895673182517, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2011358737945557, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7169418334960938, + "num_tokens": 135559313.0, + "step": 5235 + }, + { + "epoch": 0.5750054908851306, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3479154109954834, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7075765132904053, + "num_tokens": 135583174.0, + "step": 5236 + }, + { + "epoch": 0.5751153085877443, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.351651668548584, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.7009420990943909, + "num_tokens": 135610717.0, + "step": 5237 + }, + { + "epoch": 0.575225126290358, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3343822956085205, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6947648525238037, + "num_tokens": 135635496.0, + "step": 5238 + }, + { + "epoch": 0.5753349439929717, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.182234764099121, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7198247909545898, + "num_tokens": 135662797.0, + "step": 5239 + }, + { + "epoch": 0.5754447616955853, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.756251335144043, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7363698482513428, + "num_tokens": 135681463.0, + "step": 5240 + }, + { + "epoch": 0.575554579398199, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1095452308654785, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.694150984287262, + "num_tokens": 135712559.0, + "step": 5241 + }, + { + "epoch": 0.5756643971008126, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4803335666656494, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7112976312637329, + "num_tokens": 
135735649.0, + "step": 5242 + }, + { + "epoch": 0.5757742148034263, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3076791763305664, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6951442360877991, + "num_tokens": 135763058.0, + "step": 5243 + }, + { + "epoch": 0.5758840325060399, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3863284587860107, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7115853428840637, + "num_tokens": 135785873.0, + "step": 5244 + }, + { + "epoch": 0.5759938502086537, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.426872491836548, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7147650122642517, + "num_tokens": 135809402.0, + "step": 5245 + }, + { + "epoch": 0.5761036679112673, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.6511590480804443, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6860929727554321, + "num_tokens": 135832842.0, + "step": 5246 + }, + { + "epoch": 0.576213485613881, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.390714645385742, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6965193748474121, + "num_tokens": 135859345.0, + "step": 5247 + }, + { + "epoch": 0.5763233033164946, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2631325721740723, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.7035893797874451, + "num_tokens": 135885536.0, + "step": 5248 + }, + { + "epoch": 0.5764331210191083, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3655622005462646, + "learning_rate": 1e-06, + "loss": 1.0851, + "mean_token_accuracy": 0.6790599226951599, + "num_tokens": 135912140.0, + "step": 5249 + }, + { + "epoch": 0.5765429387217219, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1764445304870605, + "learning_rate": 1e-06, + "loss": 1.1343, + "mean_token_accuracy": 0.6771023273468018, + "num_tokens": 135942276.0, + "step": 5250 + }, + 
{ + "epoch": 0.5766527564243356, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.238682508468628, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7058053016662598, + "num_tokens": 135969380.0, + "step": 5251 + }, + { + "epoch": 0.5767625741269493, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.144681453704834, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7230287790298462, + "num_tokens": 135997935.0, + "step": 5252 + }, + { + "epoch": 0.576872391829563, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3565874099731445, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6928406357765198, + "num_tokens": 136022158.0, + "step": 5253 + }, + { + "epoch": 0.5769822095321766, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2113518714904785, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7060444355010986, + "num_tokens": 136048515.0, + "step": 5254 + }, + { + "epoch": 0.5770920272347903, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.370277166366577, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.707362949848175, + "num_tokens": 136072347.0, + "step": 5255 + }, + { + "epoch": 0.5772018449374039, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.36007022857666, + "learning_rate": 1e-06, + "loss": 1.0901, + "mean_token_accuracy": 0.6784590482711792, + "num_tokens": 136098672.0, + "step": 5256 + }, + { + "epoch": 0.5773116626400175, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.408311128616333, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6963481903076172, + "num_tokens": 136123095.0, + "step": 5257 + }, + { + "epoch": 0.5774214803426312, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3451552391052246, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6797454953193665, + "num_tokens": 136150673.0, + "step": 5258 + }, + { + "epoch": 0.577531298045245, + 
"ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.058039903640747, + "learning_rate": 1e-06, + "loss": 1.089, + "mean_token_accuracy": 0.6853658556938171, + "num_tokens": 136185183.0, + "step": 5259 + }, + { + "epoch": 0.5776411157478586, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.0416982173919678, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7068753242492676, + "num_tokens": 136217660.0, + "step": 5260 + }, + { + "epoch": 0.5777509334504722, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4397506713867188, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7114053964614868, + "num_tokens": 136240480.0, + "step": 5261 + }, + { + "epoch": 0.5778607511530859, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.436168909072876, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6864389181137085, + "num_tokens": 136265258.0, + "step": 5262 + }, + { + "epoch": 0.5779705688556995, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3820912837982178, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.700880765914917, + "num_tokens": 136291840.0, + "step": 5263 + }, + { + "epoch": 0.5780803865583132, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1789095401763916, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7093539237976074, + "num_tokens": 136320057.0, + "step": 5264 + }, + { + "epoch": 0.5781902042609268, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.215601921081543, + "learning_rate": 1e-06, + "loss": 1.073, + "mean_token_accuracy": 0.6812106370925903, + "num_tokens": 136350046.0, + "step": 5265 + }, + { + "epoch": 0.5783000219635406, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.351691722869873, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.7088106870651245, + "num_tokens": 136378683.0, + "step": 5266 + }, + { + "epoch": 0.5784098396661542, + "ewc_loss": 1.2040138244628906e-05, + 
"grad_norm": 2.209275245666504, + "learning_rate": 1e-06, + "loss": 1.0913, + "mean_token_accuracy": 0.6796000003814697, + "num_tokens": 136408297.0, + "step": 5267 + }, + { + "epoch": 0.5785196573687679, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1549503803253174, + "learning_rate": 1e-06, + "loss": 1.0914, + "mean_token_accuracy": 0.6764166355133057, + "num_tokens": 136438358.0, + "step": 5268 + }, + { + "epoch": 0.5786294750713815, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2130672931671143, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.7008264064788818, + "num_tokens": 136464614.0, + "step": 5269 + }, + { + "epoch": 0.5787392927739952, + "ewc_loss": 1.2099742889404297e-05, + "grad_norm": 8.510485649108887, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7015562653541565, + "num_tokens": 136487921.0, + "step": 5270 + }, + { + "epoch": 0.5788491104766088, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.401174783706665, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6981881856918335, + "num_tokens": 136512898.0, + "step": 5271 + }, + { + "epoch": 0.5789589281792225, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.279503107070923, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7241373062133789, + "num_tokens": 136539048.0, + "step": 5272 + }, + { + "epoch": 0.5790687458818361, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 3.817660331726074, + "learning_rate": 1e-06, + "loss": 1.1026, + "mean_token_accuracy": 0.6739743947982788, + "num_tokens": 136567431.0, + "step": 5273 + }, + { + "epoch": 0.5791785635844499, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.33762788772583, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7105364799499512, + "num_tokens": 136593366.0, + "step": 5274 + }, + { + "epoch": 0.5792883812870635, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.6242966651916504, + 
"learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6877977848052979, + "num_tokens": 136614886.0, + "step": 5275 + }, + { + "epoch": 0.5793981989896771, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2542052268981934, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6842032670974731, + "num_tokens": 136644948.0, + "step": 5276 + }, + { + "epoch": 0.5795080166922908, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.6034610271453857, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7026744484901428, + "num_tokens": 136667190.0, + "step": 5277 + }, + { + "epoch": 0.5796178343949044, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.314284563064575, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7168697118759155, + "num_tokens": 136691470.0, + "step": 5278 + }, + { + "epoch": 0.5797276520975181, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1561262607574463, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6980396509170532, + "num_tokens": 136720589.0, + "step": 5279 + }, + { + "epoch": 0.5798374698001317, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.118954658508301, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6848801970481873, + "num_tokens": 136751274.0, + "step": 5280 + }, + { + "epoch": 0.5799472875027455, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.7589094638824463, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6936476826667786, + "num_tokens": 136772880.0, + "step": 5281 + }, + { + "epoch": 0.5800571052053591, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.259126663208008, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6978483200073242, + "num_tokens": 136800349.0, + "step": 5282 + }, + { + "epoch": 0.5801669229079728, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.681800365447998, + "learning_rate": 1e-06, + "loss": 
0.9448, + "mean_token_accuracy": 0.7183457612991333, + "num_tokens": 136819943.0, + "step": 5283 + }, + { + "epoch": 0.5802767406105864, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.185159683227539, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6989567279815674, + "num_tokens": 136849882.0, + "step": 5284 + }, + { + "epoch": 0.5803865583132001, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2321622371673584, + "learning_rate": 1e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.673758864402771, + "num_tokens": 136878293.0, + "step": 5285 + }, + { + "epoch": 0.5804963760158137, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2772810459136963, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6928709745407104, + "num_tokens": 136906377.0, + "step": 5286 + }, + { + "epoch": 0.5806061937184274, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2451062202453613, + "learning_rate": 1e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.692048966884613, + "num_tokens": 136937276.0, + "step": 5287 + }, + { + "epoch": 0.5807160114210411, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.333211898803711, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6992247104644775, + "num_tokens": 136962371.0, + "step": 5288 + }, + { + "epoch": 0.5808258291236548, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2179760932922363, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.713875949382782, + "num_tokens": 136989852.0, + "step": 5289 + }, + { + "epoch": 0.5809356468262684, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.7079648971557617, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7082058191299438, + "num_tokens": 137008127.0, + "step": 5290 + }, + { + "epoch": 0.5810454645288821, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3181779384613037, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 
0.739920973777771, + "num_tokens": 137031663.0, + "step": 5291 + }, + { + "epoch": 0.5811552822314957, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3250527381896973, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6949288845062256, + "num_tokens": 137059073.0, + "step": 5292 + }, + { + "epoch": 0.5812650999341094, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.561384439468384, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7263822555541992, + "num_tokens": 137079263.0, + "step": 5293 + }, + { + "epoch": 0.581374917636723, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1763534545898438, + "learning_rate": 1e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6794629693031311, + "num_tokens": 137111579.0, + "step": 5294 + }, + { + "epoch": 0.5814847353393368, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.2720797061920166, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7180455923080444, + "num_tokens": 137136444.0, + "step": 5295 + }, + { + "epoch": 0.5815945530419504, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3262221813201904, + "learning_rate": 1e-06, + "loss": 1.1623, + "mean_token_accuracy": 0.6583948135375977, + "num_tokens": 137165499.0, + "step": 5296 + }, + { + "epoch": 0.581704370744564, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.1622462272644043, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6942676305770874, + "num_tokens": 137194506.0, + "step": 5297 + }, + { + "epoch": 0.5818141884471777, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.3018202781677246, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6986849308013916, + "num_tokens": 137222222.0, + "step": 5298 + }, + { + "epoch": 0.5819240061497913, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4088258743286133, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6872514486312866, + "num_tokens": 
137248256.0, + "step": 5299 + }, + { + "epoch": 0.582033823852405, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.339099407196045, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7331528663635254, + "num_tokens": 137272022.0, + "step": 5300 + }, + { + "epoch": 0.5821436415550186, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.4777681827545166, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7008068561553955, + "num_tokens": 137294545.0, + "step": 5301 + }, + { + "epoch": 0.5822534592576323, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 2.6973624229431152, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7072572708129883, + "num_tokens": 137314838.0, + "step": 5302 + }, + { + "epoch": 0.582363276960246, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2985470294952393, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6956102252006531, + "num_tokens": 137340879.0, + "step": 5303 + }, + { + "epoch": 0.5824730946628597, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3556909561157227, + "learning_rate": 1e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.6902540326118469, + "num_tokens": 137368376.0, + "step": 5304 + }, + { + "epoch": 0.5825829123654733, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.284242868423462, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6952605247497559, + "num_tokens": 137396813.0, + "step": 5305 + }, + { + "epoch": 0.582692730068087, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.35514497756958, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7014868855476379, + "num_tokens": 137421814.0, + "step": 5306 + }, + { + "epoch": 0.5828025477707006, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.332216501235962, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7020856142044067, + "num_tokens": 137448987.0, + "step": 5307 + }, + { + 
"epoch": 0.5829123654733143, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.1163711547851562, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7004687786102295, + "num_tokens": 137478806.0, + "step": 5308 + }, + { + "epoch": 0.5830221831759279, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.330449104309082, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6814748048782349, + "num_tokens": 137504325.0, + "step": 5309 + }, + { + "epoch": 0.5831320008785417, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.6406362056732178, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7021201848983765, + "num_tokens": 137525204.0, + "step": 5310 + }, + { + "epoch": 0.5832418185811553, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3959593772888184, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7073544263839722, + "num_tokens": 137551618.0, + "step": 5311 + }, + { + "epoch": 0.583351636283769, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3018815517425537, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7048979997634888, + "num_tokens": 137577095.0, + "step": 5312 + }, + { + "epoch": 0.5834614539863826, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.240790367126465, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.71286541223526, + "num_tokens": 137606454.0, + "step": 5313 + }, + { + "epoch": 0.5835712716889963, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.260380744934082, + "learning_rate": 1e-06, + "loss": 1.1132, + "mean_token_accuracy": 0.6696922779083252, + "num_tokens": 137634449.0, + "step": 5314 + }, + { + "epoch": 0.5836810893916099, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3535635471343994, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.7003499269485474, + "num_tokens": 137658097.0, + "step": 5315 + }, + { + "epoch": 0.5837909070942235, + 
"ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4967877864837646, + "learning_rate": 1e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6845451593399048, + "num_tokens": 137683043.0, + "step": 5316 + }, + { + "epoch": 0.5839007247968373, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.6222548484802246, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7181757092475891, + "num_tokens": 137703335.0, + "step": 5317 + }, + { + "epoch": 0.584010542499451, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.6365573406219482, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7061914801597595, + "num_tokens": 137722898.0, + "step": 5318 + }, + { + "epoch": 0.5841203602020646, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.0787320137023926, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7041318416595459, + "num_tokens": 137755357.0, + "step": 5319 + }, + { + "epoch": 0.5842301779046782, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3325891494750977, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6984747052192688, + "num_tokens": 137785996.0, + "step": 5320 + }, + { + "epoch": 0.5843399956072919, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3661108016967773, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.7017378807067871, + "num_tokens": 137812226.0, + "step": 5321 + }, + { + "epoch": 0.5844498133099055, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.0661380290985107, + "learning_rate": 1e-06, + "loss": 1.1661, + "mean_token_accuracy": 0.6625472903251648, + "num_tokens": 137845770.0, + "step": 5322 + }, + { + "epoch": 0.5845596310125192, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2248342037200928, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.694700300693512, + "num_tokens": 137874955.0, + "step": 5323 + }, + { + "epoch": 0.5846694487151329, + "ewc_loss": 
1.2159347534179688e-05, + "grad_norm": 2.05503249168396, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7083325982093811, + "num_tokens": 137908104.0, + "step": 5324 + }, + { + "epoch": 0.5847792664177466, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4222280979156494, + "learning_rate": 1e-06, + "loss": 1.1014, + "mean_token_accuracy": 0.6788524389266968, + "num_tokens": 137933143.0, + "step": 5325 + }, + { + "epoch": 0.5848890841203602, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.187598943710327, + "learning_rate": 1e-06, + "loss": 1.0899, + "mean_token_accuracy": 0.6802679300308228, + "num_tokens": 137963416.0, + "step": 5326 + }, + { + "epoch": 0.5849989018229739, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.339531898498535, + "learning_rate": 1e-06, + "loss": 1.1206, + "mean_token_accuracy": 0.6686313152313232, + "num_tokens": 137990308.0, + "step": 5327 + }, + { + "epoch": 0.5851087195255875, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4204633235931396, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7221613526344299, + "num_tokens": 138012509.0, + "step": 5328 + }, + { + "epoch": 0.5852185372282012, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.672597885131836, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7167425751686096, + "num_tokens": 138032504.0, + "step": 5329 + }, + { + "epoch": 0.5853283549308148, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2672643661499023, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7236679792404175, + "num_tokens": 138057651.0, + "step": 5330 + }, + { + "epoch": 0.5854381726334285, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2429604530334473, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7166062593460083, + "num_tokens": 138086199.0, + "step": 5331 + }, + { + "epoch": 0.5855479903360422, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 
2.39351224899292, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7132695317268372, + "num_tokens": 138113881.0, + "step": 5332 + }, + { + "epoch": 0.5856578080386559, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.713900566101074, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7184834480285645, + "num_tokens": 138135806.0, + "step": 5333 + }, + { + "epoch": 0.5857676257412695, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4222655296325684, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7105203866958618, + "num_tokens": 138159020.0, + "step": 5334 + }, + { + "epoch": 0.5858774434438832, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.281540870666504, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7003947496414185, + "num_tokens": 138184601.0, + "step": 5335 + }, + { + "epoch": 0.5859872611464968, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.309476137161255, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6837680339813232, + "num_tokens": 138212102.0, + "step": 5336 + }, + { + "epoch": 0.5860970788491104, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.488485813140869, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6820536255836487, + "num_tokens": 138238813.0, + "step": 5337 + }, + { + "epoch": 0.5862068965517241, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.54673433303833, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6957142949104309, + "num_tokens": 138260141.0, + "step": 5338 + }, + { + "epoch": 0.5863167142543378, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3237504959106445, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7246580719947815, + "num_tokens": 138283651.0, + "step": 5339 + }, + { + "epoch": 0.5864265319569515, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.100473165512085, + "learning_rate": 
1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.695589542388916, + "num_tokens": 138315435.0, + "step": 5340 + }, + { + "epoch": 0.5865363496595651, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.286376953125, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.706932783126831, + "num_tokens": 138342803.0, + "step": 5341 + }, + { + "epoch": 0.5866461673621788, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.5735273361206055, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6929806470870972, + "num_tokens": 138365491.0, + "step": 5342 + }, + { + "epoch": 0.5867559850647924, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.0731489658355713, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7169903516769409, + "num_tokens": 138395065.0, + "step": 5343 + }, + { + "epoch": 0.5868658027674061, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.417919635772705, + "learning_rate": 1e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.6801597476005554, + "num_tokens": 138420604.0, + "step": 5344 + }, + { + "epoch": 0.5869756204700197, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.220233201980591, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.7006371021270752, + "num_tokens": 138448634.0, + "step": 5345 + }, + { + "epoch": 0.5870854381726335, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2011847496032715, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6961797475814819, + "num_tokens": 138477350.0, + "step": 5346 + }, + { + "epoch": 0.5871952558752471, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.651341438293457, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7048689126968384, + "num_tokens": 138499062.0, + "step": 5347 + }, + { + "epoch": 0.5873050735778608, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.5907559394836426, + "learning_rate": 1e-06, + "loss": 1.0155, + 
"mean_token_accuracy": 0.699225902557373, + "num_tokens": 138520145.0, + "step": 5348 + }, + { + "epoch": 0.5874148912804744, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.6516032218933105, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6987436413764954, + "num_tokens": 138541908.0, + "step": 5349 + }, + { + "epoch": 0.5875247089830881, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 7.031937599182129, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7363570928573608, + "num_tokens": 138566648.0, + "step": 5350 + }, + { + "epoch": 0.5876345266857017, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.263902187347412, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7132874131202698, + "num_tokens": 138593706.0, + "step": 5351 + }, + { + "epoch": 0.5877443443883154, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3274052143096924, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7028549909591675, + "num_tokens": 138622965.0, + "step": 5352 + }, + { + "epoch": 0.5878541620909291, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4426956176757812, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6888972520828247, + "num_tokens": 138649693.0, + "step": 5353 + }, + { + "epoch": 0.5879639797935428, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.0306806564331055, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6921229958534241, + "num_tokens": 138681641.0, + "step": 5354 + }, + { + "epoch": 0.5880737974961564, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.457803249359131, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6956309676170349, + "num_tokens": 138705096.0, + "step": 5355 + }, + { + "epoch": 0.58818361519877, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 32.36302947998047, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 
0.709523618221283, + "num_tokens": 138725318.0, + "step": 5356 + }, + { + "epoch": 0.5882934329013837, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.373434543609619, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7072883248329163, + "num_tokens": 138752583.0, + "step": 5357 + }, + { + "epoch": 0.5884032506039973, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.501389980316162, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.709989607334137, + "num_tokens": 138775165.0, + "step": 5358 + }, + { + "epoch": 0.588513068306611, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.1976981163024902, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7080717086791992, + "num_tokens": 138803464.0, + "step": 5359 + }, + { + "epoch": 0.5886228860092246, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.332547903060913, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6922577023506165, + "num_tokens": 138830517.0, + "step": 5360 + }, + { + "epoch": 0.5887327037118384, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.5258665084838867, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.7000021934509277, + "num_tokens": 138852743.0, + "step": 5361 + }, + { + "epoch": 0.588842521414452, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.345738410949707, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.7032815217971802, + "num_tokens": 138877770.0, + "step": 5362 + }, + { + "epoch": 0.5889523391170657, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.1649205684661865, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6964700222015381, + "num_tokens": 138907114.0, + "step": 5363 + }, + { + "epoch": 0.5890621568196793, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.170461654663086, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.710915744304657, + "num_tokens": 
138934234.0, + "step": 5364 + }, + { + "epoch": 0.589171974522293, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.7591326236724854, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7048498392105103, + "num_tokens": 138953750.0, + "step": 5365 + }, + { + "epoch": 0.5892817922249066, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2010202407836914, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7027295827865601, + "num_tokens": 138985148.0, + "step": 5366 + }, + { + "epoch": 0.5893916099275203, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.354585647583008, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7179913520812988, + "num_tokens": 139009650.0, + "step": 5367 + }, + { + "epoch": 0.589501427630134, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4719913005828857, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7236802577972412, + "num_tokens": 139030423.0, + "step": 5368 + }, + { + "epoch": 0.5896112453327477, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4215025901794434, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7002503871917725, + "num_tokens": 139056646.0, + "step": 5369 + }, + { + "epoch": 0.5897210630353613, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.686805486679077, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7174649238586426, + "num_tokens": 139076987.0, + "step": 5370 + }, + { + "epoch": 0.589830880737975, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.238495349884033, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7230910062789917, + "num_tokens": 139101805.0, + "step": 5371 + }, + { + "epoch": 0.5899406984405886, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.168999671936035, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6975794434547424, + "num_tokens": 139129225.0, + "step": 5372 + }, + { + 
"epoch": 0.5900505161432023, + "ewc_loss": 1.2218952178955078e-05, + "grad_norm": 7.60496711730957, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.708501935005188, + "num_tokens": 139155600.0, + "step": 5373 + }, + { + "epoch": 0.5901603338458159, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3957200050354004, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6876857280731201, + "num_tokens": 139183045.0, + "step": 5374 + }, + { + "epoch": 0.5902701515484297, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.0364131927490234, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.700090765953064, + "num_tokens": 139216042.0, + "step": 5375 + }, + { + "epoch": 0.5903799692510433, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.4325921535491943, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.705510139465332, + "num_tokens": 139240011.0, + "step": 5376 + }, + { + "epoch": 0.590489786953657, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3196449279785156, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7112947106361389, + "num_tokens": 139268069.0, + "step": 5377 + }, + { + "epoch": 0.5905996046562706, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.632388114929199, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7199079394340515, + "num_tokens": 139289186.0, + "step": 5378 + }, + { + "epoch": 0.5907094223588842, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.230747938156128, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.7043113112449646, + "num_tokens": 139318862.0, + "step": 5379 + }, + { + "epoch": 0.5908192400614979, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4727673530578613, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7011171579360962, + "num_tokens": 139342120.0, + "step": 5380 + }, + { + "epoch": 0.5909290577641115, + 
"ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.1128175258636475, + "learning_rate": 1e-06, + "loss": 1.1528, + "mean_token_accuracy": 0.6622828245162964, + "num_tokens": 139374511.0, + "step": 5381 + }, + { + "epoch": 0.5910388754667253, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.6156463623046875, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7229072451591492, + "num_tokens": 139395596.0, + "step": 5382 + }, + { + "epoch": 0.5911486931693389, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3678371906280518, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6925005912780762, + "num_tokens": 139420950.0, + "step": 5383 + }, + { + "epoch": 0.5912585108719526, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.243675947189331, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7113363742828369, + "num_tokens": 139449011.0, + "step": 5384 + }, + { + "epoch": 0.5913683285745662, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.0156421661376953, + "learning_rate": 1e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.6892448663711548, + "num_tokens": 139480828.0, + "step": 5385 + }, + { + "epoch": 0.5914781462771799, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2933509349823, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6988810300827026, + "num_tokens": 139507020.0, + "step": 5386 + }, + { + "epoch": 0.5915879639797935, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2438039779663086, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.699322521686554, + "num_tokens": 139534335.0, + "step": 5387 + }, + { + "epoch": 0.5916977816824072, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.210780382156372, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.70567786693573, + "num_tokens": 139561396.0, + "step": 5388 + }, + { + "epoch": 0.5918075993850208, + "ewc_loss": 1.2159347534179688e-05, + 
"grad_norm": 2.243739604949951, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6864681839942932, + "num_tokens": 139588667.0, + "step": 5389 + }, + { + "epoch": 0.5919174170876346, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.1653623580932617, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6867859363555908, + "num_tokens": 139621085.0, + "step": 5390 + }, + { + "epoch": 0.5920272347902482, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4187517166137695, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7213674187660217, + "num_tokens": 139644535.0, + "step": 5391 + }, + { + "epoch": 0.5921370524928619, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.055551290512085, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.7037161588668823, + "num_tokens": 139675145.0, + "step": 5392 + }, + { + "epoch": 0.5922468701954755, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4263174533843994, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7119923830032349, + "num_tokens": 139698271.0, + "step": 5393 + }, + { + "epoch": 0.5923566878980892, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.1126625537872314, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.705035924911499, + "num_tokens": 139728259.0, + "step": 5394 + }, + { + "epoch": 0.5924665056007028, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2836828231811523, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7152665853500366, + "num_tokens": 139751081.0, + "step": 5395 + }, + { + "epoch": 0.5925763233033164, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.0846006870269775, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6952077746391296, + "num_tokens": 139783392.0, + "step": 5396 + }, + { + "epoch": 0.5926861410059302, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.269282341003418, + 
"learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.7029374837875366, + "num_tokens": 139809224.0, + "step": 5397 + }, + { + "epoch": 0.5927959587085438, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.283316135406494, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7291029691696167, + "num_tokens": 139835349.0, + "step": 5398 + }, + { + "epoch": 0.5929057764111575, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.424717426300049, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7203749418258667, + "num_tokens": 139858710.0, + "step": 5399 + }, + { + "epoch": 0.5930155941137711, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3267874717712402, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6916961669921875, + "num_tokens": 139883838.0, + "step": 5400 + }, + { + "epoch": 0.5931254118163848, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4506494998931885, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.696182131767273, + "num_tokens": 139908596.0, + "step": 5401 + }, + { + "epoch": 0.5932352295189984, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3016419410705566, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.7043187618255615, + "num_tokens": 139937133.0, + "step": 5402 + }, + { + "epoch": 0.5933450472216121, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.374779462814331, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6873812675476074, + "num_tokens": 139962948.0, + "step": 5403 + }, + { + "epoch": 0.5934548649242258, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.1883163452148438, + "learning_rate": 1e-06, + "loss": 1.1461, + "mean_token_accuracy": 0.6608720421791077, + "num_tokens": 139993617.0, + "step": 5404 + }, + { + "epoch": 0.5935646826268395, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2504634857177734, + "learning_rate": 1e-06, + "loss": 
1.0553, + "mean_token_accuracy": 0.6942648887634277, + "num_tokens": 140021652.0, + "step": 5405 + }, + { + "epoch": 0.5936745003294531, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.27119779586792, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6930310726165771, + "num_tokens": 140049633.0, + "step": 5406 + }, + { + "epoch": 0.5937843180320668, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.559622287750244, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7359969615936279, + "num_tokens": 140070854.0, + "step": 5407 + }, + { + "epoch": 0.5938941357346804, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.4289445877075195, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7048197388648987, + "num_tokens": 140094943.0, + "step": 5408 + }, + { + "epoch": 0.5940039534372941, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.65627384185791, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7028017044067383, + "num_tokens": 140114169.0, + "step": 5409 + }, + { + "epoch": 0.5941137711399077, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.316471576690674, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6992735266685486, + "num_tokens": 140139128.0, + "step": 5410 + }, + { + "epoch": 0.5942235888425215, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.3887386322021484, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7150989770889282, + "num_tokens": 140165992.0, + "step": 5411 + }, + { + "epoch": 0.5943334065451351, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.517157793045044, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6874728202819824, + "num_tokens": 140190420.0, + "step": 5412 + }, + { + "epoch": 0.5944432242477488, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.2260518074035645, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 
0.7129600048065186, + "num_tokens": 140218466.0, + "step": 5413 + }, + { + "epoch": 0.5945530419503624, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.4963128566741943, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7168792486190796, + "num_tokens": 140241243.0, + "step": 5414 + }, + { + "epoch": 0.594662859652976, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.376823902130127, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.7066200375556946, + "num_tokens": 140269198.0, + "step": 5415 + }, + { + "epoch": 0.5947726773555897, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.376518487930298, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7250362634658813, + "num_tokens": 140291919.0, + "step": 5416 + }, + { + "epoch": 0.5948824950582033, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 2.526475191116333, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7152963876724243, + "num_tokens": 140314477.0, + "step": 5417 + }, + { + "epoch": 0.5949923127608171, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.5109949111938477, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7044476270675659, + "num_tokens": 140337521.0, + "step": 5418 + }, + { + "epoch": 0.5951021304634307, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.4736921787261963, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.695904016494751, + "num_tokens": 140360715.0, + "step": 5419 + }, + { + "epoch": 0.5952119481660444, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.4105827808380127, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7012317776679993, + "num_tokens": 140384016.0, + "step": 5420 + }, + { + "epoch": 0.595321765868658, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.411769390106201, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7251678705215454, + "num_tokens": 
140408011.0, + "step": 5421 + }, + { + "epoch": 0.5954315835712717, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.054779291152954, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.7015407681465149, + "num_tokens": 140439623.0, + "step": 5422 + }, + { + "epoch": 0.5955414012738853, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.406026601791382, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6865271925926208, + "num_tokens": 140463762.0, + "step": 5423 + }, + { + "epoch": 0.595651218976499, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.3231289386749268, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6850581765174866, + "num_tokens": 140489649.0, + "step": 5424 + }, + { + "epoch": 0.5957610366791126, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.6464731693267822, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7045080065727234, + "num_tokens": 140509941.0, + "step": 5425 + }, + { + "epoch": 0.5958708543817264, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.3996167182922363, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6938493251800537, + "num_tokens": 140535871.0, + "step": 5426 + }, + { + "epoch": 0.59598067208434, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.3022193908691406, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7070304155349731, + "num_tokens": 140562930.0, + "step": 5427 + }, + { + "epoch": 0.5960904897869537, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.2144227027893066, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.712209939956665, + "num_tokens": 140590643.0, + "step": 5428 + }, + { + "epoch": 0.5962003074895673, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.5495011806488037, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6916577816009521, + "num_tokens": 140615200.0, + "step": 5429 + }, + { 
+ "epoch": 0.596310125192181, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 3.9771788120269775, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.715579628944397, + "num_tokens": 140645253.0, + "step": 5430 + }, + { + "epoch": 0.5964199428947946, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.432985305786133, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7195626497268677, + "num_tokens": 140667899.0, + "step": 5431 + }, + { + "epoch": 0.5965297605974083, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.5363452434539795, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6888407468795776, + "num_tokens": 140691109.0, + "step": 5432 + }, + { + "epoch": 0.596639578300022, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.354804754257202, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6920984983444214, + "num_tokens": 140717798.0, + "step": 5433 + }, + { + "epoch": 0.5967493960026357, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.0926835536956787, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6898549199104309, + "num_tokens": 140749119.0, + "step": 5434 + }, + { + "epoch": 0.5968592137052493, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.2918283939361572, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.711220383644104, + "num_tokens": 140774407.0, + "step": 5435 + }, + { + "epoch": 0.596969031407863, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.3323678970336914, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6950240135192871, + "num_tokens": 140800384.0, + "step": 5436 + }, + { + "epoch": 0.5970788491104766, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.3939762115478516, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7169532775878906, + "num_tokens": 140823824.0, + "step": 5437 + }, + { + "epoch": 0.5971886668130902, + 
"ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.533737897872925, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7019567489624023, + "num_tokens": 140845377.0, + "step": 5438 + }, + { + "epoch": 0.5972984845157039, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.3086607456207275, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7140693664550781, + "num_tokens": 140869853.0, + "step": 5439 + }, + { + "epoch": 0.5974083022183176, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.3952677249908447, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7203956842422485, + "num_tokens": 140893376.0, + "step": 5440 + }, + { + "epoch": 0.5975181199209313, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.0521883964538574, + "learning_rate": 1e-06, + "loss": 1.1067, + "mean_token_accuracy": 0.6697808504104614, + "num_tokens": 140930049.0, + "step": 5441 + }, + { + "epoch": 0.5976279376235449, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.15786075592041, + "learning_rate": 1e-06, + "loss": 1.108, + "mean_token_accuracy": 0.6777783632278442, + "num_tokens": 140960774.0, + "step": 5442 + }, + { + "epoch": 0.5977377553261586, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.023197889328003, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6957427859306335, + "num_tokens": 140991688.0, + "step": 5443 + }, + { + "epoch": 0.5978475730287722, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.2809457778930664, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7094086408615112, + "num_tokens": 141018534.0, + "step": 5444 + }, + { + "epoch": 0.5979573907313859, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.623525857925415, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6929185390472412, + "num_tokens": 141040847.0, + "step": 5445 + }, + { + "epoch": 0.5980672084339995, + "ewc_loss": 1.2278556823730469e-05, + 
"grad_norm": 2.2515811920166016, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6913357973098755, + "num_tokens": 141068941.0, + "step": 5446 + }, + { + "epoch": 0.5981770261366133, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.833888530731201, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7269062995910645, + "num_tokens": 141088818.0, + "step": 5447 + }, + { + "epoch": 0.5982868438392269, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.280442714691162, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7394965887069702, + "num_tokens": 141111050.0, + "step": 5448 + }, + { + "epoch": 0.5983966615418406, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.326155424118042, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7219560146331787, + "num_tokens": 141134434.0, + "step": 5449 + }, + { + "epoch": 0.5985064792444542, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.582270860671997, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7141770720481873, + "num_tokens": 141156074.0, + "step": 5450 + }, + { + "epoch": 0.5986162969470679, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.2473490238189697, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6967686414718628, + "num_tokens": 141184073.0, + "step": 5451 + }, + { + "epoch": 0.5987261146496815, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 2.334813117980957, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7088085412979126, + "num_tokens": 141207261.0, + "step": 5452 + }, + { + "epoch": 0.5988359323522952, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.6584479808807373, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7151002883911133, + "num_tokens": 141227943.0, + "step": 5453 + }, + { + "epoch": 0.5989457500549088, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.0832464694976807, + 
"learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6954730153083801, + "num_tokens": 141259938.0, + "step": 5454 + }, + { + "epoch": 0.5990555677575226, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.6062278747558594, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7009106874465942, + "num_tokens": 141281372.0, + "step": 5455 + }, + { + "epoch": 0.5991653854601362, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.5694262981414795, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7159069776535034, + "num_tokens": 141301792.0, + "step": 5456 + }, + { + "epoch": 0.5992752031627498, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.2769570350646973, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7010284662246704, + "num_tokens": 141330241.0, + "step": 5457 + }, + { + "epoch": 0.5993850208653635, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.20082950592041, + "learning_rate": 1e-06, + "loss": 1.0892, + "mean_token_accuracy": 0.6909396648406982, + "num_tokens": 141360496.0, + "step": 5458 + }, + { + "epoch": 0.5994948385679771, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.309267997741699, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6878591775894165, + "num_tokens": 141386431.0, + "step": 5459 + }, + { + "epoch": 0.5996046562705908, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.2644710540771484, + "learning_rate": 1e-06, + "loss": 1.1379, + "mean_token_accuracy": 0.6656782031059265, + "num_tokens": 141415944.0, + "step": 5460 + }, + { + "epoch": 0.5997144739732044, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.2631030082702637, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6982835531234741, + "num_tokens": 141444005.0, + "step": 5461 + }, + { + "epoch": 0.5998242916758182, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.313913583755493, + "learning_rate": 1e-06, + "loss": 1.0533, + 
"mean_token_accuracy": 0.6888457536697388, + "num_tokens": 141470558.0, + "step": 5462 + }, + { + "epoch": 0.5999341093784318, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.2176709175109863, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6873601675033569, + "num_tokens": 141499619.0, + "step": 5463 + }, + { + "epoch": 0.6000439270810455, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.559776782989502, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7193291783332825, + "num_tokens": 141519971.0, + "step": 5464 + }, + { + "epoch": 0.6001537447836591, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.278463363647461, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7205418944358826, + "num_tokens": 141547413.0, + "step": 5465 + }, + { + "epoch": 0.6002635624862728, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.013901948928833, + "learning_rate": 1e-06, + "loss": 1.1023, + "mean_token_accuracy": 0.6766030192375183, + "num_tokens": 141582883.0, + "step": 5466 + }, + { + "epoch": 0.6003733801888864, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.329791307449341, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7015730738639832, + "num_tokens": 141609111.0, + "step": 5467 + }, + { + "epoch": 0.6004831978915001, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.328282117843628, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7198909521102905, + "num_tokens": 141632586.0, + "step": 5468 + }, + { + "epoch": 0.6005930155941138, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.632856845855713, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7127200365066528, + "num_tokens": 141653030.0, + "step": 5469 + }, + { + "epoch": 0.6007028332967275, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.2607359886169434, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6910381317138672, + 
"num_tokens": 141680762.0, + "step": 5470 + }, + { + "epoch": 0.6008126509993411, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.4470088481903076, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.687506377696991, + "num_tokens": 141705139.0, + "step": 5471 + }, + { + "epoch": 0.6009224687019548, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.3244142532348633, + "learning_rate": 1e-06, + "loss": 1.0994, + "mean_token_accuracy": 0.6784380674362183, + "num_tokens": 141732248.0, + "step": 5472 + }, + { + "epoch": 0.6010322864045684, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.532803535461426, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6862272024154663, + "num_tokens": 141756347.0, + "step": 5473 + }, + { + "epoch": 0.601142104107182, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.0290589332580566, + "learning_rate": 1e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6865655183792114, + "num_tokens": 141790860.0, + "step": 5474 + }, + { + "epoch": 0.6012519218097957, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.2915685176849365, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7015437483787537, + "num_tokens": 141818768.0, + "step": 5475 + }, + { + "epoch": 0.6013617395124095, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.1683099269866943, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7144591808319092, + "num_tokens": 141846167.0, + "step": 5476 + }, + { + "epoch": 0.6014715572150231, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.607178211212158, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7058895230293274, + "num_tokens": 141867109.0, + "step": 5477 + }, + { + "epoch": 0.6015813749176367, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.234950304031372, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6884101033210754, + "num_tokens": 141894244.0, + "step": 5478 + 
}, + { + "epoch": 0.6016911926202504, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.45988130569458, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.711536705493927, + "num_tokens": 141918529.0, + "step": 5479 + }, + { + "epoch": 0.601801010322864, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.425297260284424, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7089509963989258, + "num_tokens": 141941287.0, + "step": 5480 + }, + { + "epoch": 0.6019108280254777, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.49306058883667, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6824031472206116, + "num_tokens": 141965353.0, + "step": 5481 + }, + { + "epoch": 0.6020206457280913, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.4219329357147217, + "learning_rate": 1e-06, + "loss": 1.1383, + "mean_token_accuracy": 0.6736838817596436, + "num_tokens": 141992824.0, + "step": 5482 + }, + { + "epoch": 0.602130463430705, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.546947717666626, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6876919269561768, + "num_tokens": 142016379.0, + "step": 5483 + }, + { + "epoch": 0.6022402811333187, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.0934572219848633, + "learning_rate": 1e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6806194186210632, + "num_tokens": 142051236.0, + "step": 5484 + }, + { + "epoch": 0.6023500988359324, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.4349377155303955, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6915833950042725, + "num_tokens": 142076418.0, + "step": 5485 + }, + { + "epoch": 0.602459916538546, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.702310800552368, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.718076229095459, + "num_tokens": 142096091.0, + "step": 5486 + }, + { + "epoch": 0.6025697342411597, + "ewc_loss": 
1.233816146850586e-05, + "grad_norm": 2.2322893142700195, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7049506306648254, + "num_tokens": 142124345.0, + "step": 5487 + }, + { + "epoch": 0.6026795519437733, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.3657846450805664, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6868343353271484, + "num_tokens": 142154203.0, + "step": 5488 + }, + { + "epoch": 0.602789369646387, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.396925210952759, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6966867446899414, + "num_tokens": 142178440.0, + "step": 5489 + }, + { + "epoch": 0.6028991873490006, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.482053756713867, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7070108652114868, + "num_tokens": 142203260.0, + "step": 5490 + }, + { + "epoch": 0.6030090050516144, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 2.744457483291626, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7114886045455933, + "num_tokens": 142222642.0, + "step": 5491 + }, + { + "epoch": 0.603118822754228, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.405748128890991, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7098274230957031, + "num_tokens": 142248504.0, + "step": 5492 + }, + { + "epoch": 0.6032286404568417, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.359151601791382, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.70452880859375, + "num_tokens": 142275888.0, + "step": 5493 + }, + { + "epoch": 0.6033384581594553, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.358938455581665, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6949148178100586, + "num_tokens": 142302175.0, + "step": 5494 + }, + { + "epoch": 0.603448275862069, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 
2.498380422592163, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.7070286273956299, + "num_tokens": 142326863.0, + "step": 5495 + }, + { + "epoch": 0.6035580935646826, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.1596946716308594, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.7027299404144287, + "num_tokens": 142355356.0, + "step": 5496 + }, + { + "epoch": 0.6036679112672962, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.4025981426239014, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7047700881958008, + "num_tokens": 142379442.0, + "step": 5497 + }, + { + "epoch": 0.60377772896991, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.420982599258423, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7007511854171753, + "num_tokens": 142402486.0, + "step": 5498 + }, + { + "epoch": 0.6038875466725236, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.451570510864258, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6907118558883667, + "num_tokens": 142427756.0, + "step": 5499 + }, + { + "epoch": 0.6039973643751373, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.2143499851226807, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7025265693664551, + "num_tokens": 142456504.0, + "step": 5500 + }, + { + "epoch": 0.6041071820777509, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.261857271194458, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7168647646903992, + "num_tokens": 142481999.0, + "step": 5501 + }, + { + "epoch": 0.6042169997803646, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.2419652938842773, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7128173112869263, + "num_tokens": 142508085.0, + "step": 5502 + }, + { + "epoch": 0.6043268174829782, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.5391831398010254, + "learning_rate": 1e-06, 
+ "loss": 1.0883, + "mean_token_accuracy": 0.679679274559021, + "num_tokens": 142533502.0, + "step": 5503 + }, + { + "epoch": 0.6044366351855919, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.2478179931640625, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7056201696395874, + "num_tokens": 142562232.0, + "step": 5504 + }, + { + "epoch": 0.6045464528882056, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.3794519901275635, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7158634066581726, + "num_tokens": 142584535.0, + "step": 5505 + }, + { + "epoch": 0.6046562705908193, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.210737466812134, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6964387893676758, + "num_tokens": 142613548.0, + "step": 5506 + }, + { + "epoch": 0.6047660882934329, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.4815213680267334, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7005309462547302, + "num_tokens": 142634501.0, + "step": 5507 + }, + { + "epoch": 0.6048759059960466, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3528339862823486, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.709651529788971, + "num_tokens": 142660053.0, + "step": 5508 + }, + { + "epoch": 0.6049857236986602, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.228916645050049, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6996141672134399, + "num_tokens": 142686587.0, + "step": 5509 + }, + { + "epoch": 0.6050955414012739, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.371591329574585, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6974003314971924, + "num_tokens": 142710842.0, + "step": 5510 + }, + { + "epoch": 0.6052053591038875, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.0424158573150635, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 
0.6959149241447449, + "num_tokens": 142744140.0, + "step": 5511 + }, + { + "epoch": 0.6053151768065012, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.281831979751587, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.703114926815033, + "num_tokens": 142769131.0, + "step": 5512 + }, + { + "epoch": 0.6054249945091149, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3726425170898438, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.7084481716156006, + "num_tokens": 142798422.0, + "step": 5513 + }, + { + "epoch": 0.6055348122117286, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.504666805267334, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7102511525154114, + "num_tokens": 142820626.0, + "step": 5514 + }, + { + "epoch": 0.6056446299143422, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.37593674659729, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7117826342582703, + "num_tokens": 142842981.0, + "step": 5515 + }, + { + "epoch": 0.6057544476169558, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.6086785793304443, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7067470550537109, + "num_tokens": 142862928.0, + "step": 5516 + }, + { + "epoch": 0.6058642653195695, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3744447231292725, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6921233534812927, + "num_tokens": 142891872.0, + "step": 5517 + }, + { + "epoch": 0.6059740830221831, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.218601942062378, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.699052631855011, + "num_tokens": 142919028.0, + "step": 5518 + }, + { + "epoch": 0.6060839007247968, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.137483835220337, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7152875661849976, + "num_tokens": 142947374.0, 
+ "step": 5519 + }, + { + "epoch": 0.6061937184274105, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2776217460632324, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6854961514472961, + "num_tokens": 142974095.0, + "step": 5520 + }, + { + "epoch": 0.6063035361300242, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.546675205230713, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6860216856002808, + "num_tokens": 142995967.0, + "step": 5521 + }, + { + "epoch": 0.6064133538326378, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2006471157073975, + "learning_rate": 1e-06, + "loss": 1.0952, + "mean_token_accuracy": 0.6766588091850281, + "num_tokens": 143025221.0, + "step": 5522 + }, + { + "epoch": 0.6065231715352515, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.292264461517334, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.700432538986206, + "num_tokens": 143051527.0, + "step": 5523 + }, + { + "epoch": 0.6066329892378651, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.5261011123657227, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6939606666564941, + "num_tokens": 143075544.0, + "step": 5524 + }, + { + "epoch": 0.6067428069404788, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.263542413711548, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7184971570968628, + "num_tokens": 143099855.0, + "step": 5525 + }, + { + "epoch": 0.6068526246430924, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.126668930053711, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.710453450679779, + "num_tokens": 143129817.0, + "step": 5526 + }, + { + "epoch": 0.6069624423457062, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.593118906021118, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6959821581840515, + "num_tokens": 143150698.0, + "step": 5527 + }, + { + "epoch": 
0.6070722600483198, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 2.5694799423217773, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.702407956123352, + "num_tokens": 143171043.0, + "step": 5528 + }, + { + "epoch": 0.6071820777509335, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.403585910797119, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7132550477981567, + "num_tokens": 143193800.0, + "step": 5529 + }, + { + "epoch": 0.6072918954535471, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.192730665206909, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6940326690673828, + "num_tokens": 143222251.0, + "step": 5530 + }, + { + "epoch": 0.6074017131561608, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.5456597805023193, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.709251880645752, + "num_tokens": 143245623.0, + "step": 5531 + }, + { + "epoch": 0.6075115308587744, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3208260536193848, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7178740501403809, + "num_tokens": 143271328.0, + "step": 5532 + }, + { + "epoch": 0.607621348561388, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.7483320236206055, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7167807221412659, + "num_tokens": 143290653.0, + "step": 5533 + }, + { + "epoch": 0.6077311662640018, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2498223781585693, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7225455045700073, + "num_tokens": 143316964.0, + "step": 5534 + }, + { + "epoch": 0.6078409839666155, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.5459349155426025, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7207326889038086, + "num_tokens": 143339087.0, + "step": 5535 + }, + { + "epoch": 0.6079508016692291, + "ewc_loss": 
1.245737075805664e-05, + "grad_norm": 2.0747592449188232, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6981849670410156, + "num_tokens": 143372999.0, + "step": 5536 + }, + { + "epoch": 0.6080606193718427, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.4592695236206055, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.700305700302124, + "num_tokens": 143398034.0, + "step": 5537 + }, + { + "epoch": 0.6081704370744564, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.477717638015747, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7162008285522461, + "num_tokens": 143419511.0, + "step": 5538 + }, + { + "epoch": 0.60828025477707, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.182079315185547, + "learning_rate": 1e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.6884272694587708, + "num_tokens": 143447620.0, + "step": 5539 + }, + { + "epoch": 0.6083900724796837, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.4928691387176514, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6928658485412598, + "num_tokens": 143470927.0, + "step": 5540 + }, + { + "epoch": 0.6084998901822973, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3197524547576904, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7027274370193481, + "num_tokens": 143495362.0, + "step": 5541 + }, + { + "epoch": 0.6086097078849111, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.269240140914917, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7110130786895752, + "num_tokens": 143522298.0, + "step": 5542 + }, + { + "epoch": 0.6087195255875247, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.660341501235962, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7161309719085693, + "num_tokens": 143540762.0, + "step": 5543 + }, + { + "epoch": 0.6088293432901384, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 
2.602200508117676, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7271939516067505, + "num_tokens": 143562935.0, + "step": 5544 + }, + { + "epoch": 0.608939160992752, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.5020394325256348, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6951847076416016, + "num_tokens": 143586717.0, + "step": 5545 + }, + { + "epoch": 0.6090489786953657, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2991816997528076, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6867598295211792, + "num_tokens": 143613936.0, + "step": 5546 + }, + { + "epoch": 0.6091587963979793, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2977705001831055, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7079827785491943, + "num_tokens": 143641181.0, + "step": 5547 + }, + { + "epoch": 0.609268614100593, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2920517921447754, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7026165723800659, + "num_tokens": 143666141.0, + "step": 5548 + }, + { + "epoch": 0.6093784318032067, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.229509115219116, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6968995332717896, + "num_tokens": 143694096.0, + "step": 5549 + }, + { + "epoch": 0.6094882495058204, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.146833658218384, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6929305791854858, + "num_tokens": 143724483.0, + "step": 5550 + }, + { + "epoch": 0.609598067208434, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3748059272766113, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7090412974357605, + "num_tokens": 143748850.0, + "step": 5551 + }, + { + "epoch": 0.6097078849110477, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2914037704467773, + "learning_rate": 1e-06, 
+ "loss": 1.0203, + "mean_token_accuracy": 0.6960141658782959, + "num_tokens": 143776175.0, + "step": 5552 + }, + { + "epoch": 0.6098177026136613, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3613595962524414, + "learning_rate": 1e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.6862548589706421, + "num_tokens": 143804042.0, + "step": 5553 + }, + { + "epoch": 0.609927520316275, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2734947204589844, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7040525674819946, + "num_tokens": 143831258.0, + "step": 5554 + }, + { + "epoch": 0.6100373380188886, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.1306371688842773, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6912071704864502, + "num_tokens": 143861686.0, + "step": 5555 + }, + { + "epoch": 0.6101471557215024, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.5836403369903564, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6978409290313721, + "num_tokens": 143883454.0, + "step": 5556 + }, + { + "epoch": 0.610256973424116, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.402465581893921, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.7038009166717529, + "num_tokens": 143907384.0, + "step": 5557 + }, + { + "epoch": 0.6103667911267296, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3302855491638184, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6906532049179077, + "num_tokens": 143934663.0, + "step": 5558 + }, + { + "epoch": 0.6104766088293433, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.177276134490967, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7087785005569458, + "num_tokens": 143962719.0, + "step": 5559 + }, + { + "epoch": 0.6105864265319569, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.9363934993743896, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 
0.7084049582481384, + "num_tokens": 143983202.0, + "step": 5560 + }, + { + "epoch": 0.6106962442345706, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.574525833129883, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7111437320709229, + "num_tokens": 144004714.0, + "step": 5561 + }, + { + "epoch": 0.6108060619371842, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.705355405807495, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6945358514785767, + "num_tokens": 144025861.0, + "step": 5562 + }, + { + "epoch": 0.610915879639798, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.49133038520813, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7118081450462341, + "num_tokens": 144048018.0, + "step": 5563 + }, + { + "epoch": 0.6110256973424116, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2400410175323486, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7232248783111572, + "num_tokens": 144073098.0, + "step": 5564 + }, + { + "epoch": 0.6111355150450253, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.156689405441284, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6904653310775757, + "num_tokens": 144103692.0, + "step": 5565 + }, + { + "epoch": 0.6112453327476389, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2811896800994873, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6966689825057983, + "num_tokens": 144130071.0, + "step": 5566 + }, + { + "epoch": 0.6113551504502526, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.066884994506836, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7100026607513428, + "num_tokens": 144160736.0, + "step": 5567 + }, + { + "epoch": 0.6114649681528662, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.290759801864624, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7134520411491394, + "num_tokens": 
144187160.0, + "step": 5568 + }, + { + "epoch": 0.6115747858554799, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.409309148788452, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6895347833633423, + "num_tokens": 144215373.0, + "step": 5569 + }, + { + "epoch": 0.6116846035580935, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.085219621658325, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7204238772392273, + "num_tokens": 144246010.0, + "step": 5570 + }, + { + "epoch": 0.6117944212607073, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.4210026264190674, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6867725253105164, + "num_tokens": 144271534.0, + "step": 5571 + }, + { + "epoch": 0.6119042389633209, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.294818162918091, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6942710876464844, + "num_tokens": 144296962.0, + "step": 5572 + }, + { + "epoch": 0.6120140566659346, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.586526870727539, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7143633365631104, + "num_tokens": 144319660.0, + "step": 5573 + }, + { + "epoch": 0.6121238743685482, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.207071304321289, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7032804489135742, + "num_tokens": 144345996.0, + "step": 5574 + }, + { + "epoch": 0.6122336920711619, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.396306037902832, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7162832617759705, + "num_tokens": 144370199.0, + "step": 5575 + }, + { + "epoch": 0.6123435097737755, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.260587692260742, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6802961826324463, + "num_tokens": 144398358.0, + "step": 5576 + }, + { + "epoch": 
0.6124533274763891, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.492687702178955, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.70943683385849, + "num_tokens": 144422666.0, + "step": 5577 + }, + { + "epoch": 0.6125631451790029, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.156511068344116, + "learning_rate": 1e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.6882369518280029, + "num_tokens": 144453770.0, + "step": 5578 + }, + { + "epoch": 0.6126729628816165, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.066056489944458, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6949722170829773, + "num_tokens": 144487826.0, + "step": 5579 + }, + { + "epoch": 0.6127827805842302, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3941636085510254, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7187550067901611, + "num_tokens": 144510583.0, + "step": 5580 + }, + { + "epoch": 0.6128925982868438, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3399996757507324, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7045645713806152, + "num_tokens": 144537090.0, + "step": 5581 + }, + { + "epoch": 0.6130024159894575, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.9467014074325562, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7054497599601746, + "num_tokens": 144573721.0, + "step": 5582 + }, + { + "epoch": 0.6131122336920711, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3021039962768555, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6978013515472412, + "num_tokens": 144600228.0, + "step": 5583 + }, + { + "epoch": 0.6132220513946848, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3208441734313965, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6953791379928589, + "num_tokens": 144624825.0, + "step": 5584 + }, + { + "epoch": 0.6133318690972985, + "ewc_loss": 
1.245737075805664e-05, + "grad_norm": 2.1636099815368652, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7051339149475098, + "num_tokens": 144653870.0, + "step": 5585 + }, + { + "epoch": 0.6134416867999122, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.186974287033081, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.697769045829773, + "num_tokens": 144682126.0, + "step": 5586 + }, + { + "epoch": 0.6135515045025258, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2215676307678223, + "learning_rate": 1e-06, + "loss": 1.0995, + "mean_token_accuracy": 0.6755180358886719, + "num_tokens": 144711108.0, + "step": 5587 + }, + { + "epoch": 0.6136613222051395, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.4762468338012695, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7349749207496643, + "num_tokens": 144732059.0, + "step": 5588 + }, + { + "epoch": 0.6137711399077531, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.355848550796509, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.716259241104126, + "num_tokens": 144756402.0, + "step": 5589 + }, + { + "epoch": 0.6138809576103668, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.668877363204956, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.698452353477478, + "num_tokens": 144778478.0, + "step": 5590 + }, + { + "epoch": 0.6139907753129804, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.332939386367798, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6938788890838623, + "num_tokens": 144803171.0, + "step": 5591 + }, + { + "epoch": 0.6141005930155942, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3499608039855957, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6940812468528748, + "num_tokens": 144829122.0, + "step": 5592 + }, + { + "epoch": 0.6142104107182078, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 
2.4360692501068115, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7001104354858398, + "num_tokens": 144853840.0, + "step": 5593 + }, + { + "epoch": 0.6143202284208215, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2857284545898438, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7271714210510254, + "num_tokens": 144879799.0, + "step": 5594 + }, + { + "epoch": 0.6144300461234351, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.259049892425537, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7275698184967041, + "num_tokens": 144903876.0, + "step": 5595 + }, + { + "epoch": 0.6145398638260487, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.259021520614624, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6863099932670593, + "num_tokens": 144932095.0, + "step": 5596 + }, + { + "epoch": 0.6146496815286624, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.418011426925659, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7122128009796143, + "num_tokens": 144955621.0, + "step": 5597 + }, + { + "epoch": 0.614759499231276, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2527618408203125, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6997964978218079, + "num_tokens": 144985577.0, + "step": 5598 + }, + { + "epoch": 0.6148693169338898, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.627919912338257, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7262387871742249, + "num_tokens": 145005958.0, + "step": 5599 + }, + { + "epoch": 0.6149791346365034, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.6711859703063965, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7138501405715942, + "num_tokens": 145025793.0, + "step": 5600 + }, + { + "epoch": 0.6150889523391171, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.180595874786377, + "learning_rate": 1e-06, 
+ "loss": 0.9973, + "mean_token_accuracy": 0.6963491439819336, + "num_tokens": 145056561.0, + "step": 5601 + }, + { + "epoch": 0.6151987700417307, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.16031813621521, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7183137536048889, + "num_tokens": 145084877.0, + "step": 5602 + }, + { + "epoch": 0.6153085877443444, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.4204440116882324, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7246119976043701, + "num_tokens": 145108660.0, + "step": 5603 + }, + { + "epoch": 0.615418405446958, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3396239280700684, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7055892944335938, + "num_tokens": 145132828.0, + "step": 5604 + }, + { + "epoch": 0.6155282231495717, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.436572551727295, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6993520259857178, + "num_tokens": 145154962.0, + "step": 5605 + }, + { + "epoch": 0.6156380408521853, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.613985300064087, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6802517175674438, + "num_tokens": 145182406.0, + "step": 5606 + }, + { + "epoch": 0.6157478585547991, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3620615005493164, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6846871972084045, + "num_tokens": 145208430.0, + "step": 5607 + }, + { + "epoch": 0.6158576762574127, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.337386131286621, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7071101069450378, + "num_tokens": 145236623.0, + "step": 5608 + }, + { + "epoch": 0.6159674939600264, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.3436834812164307, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 
0.7058739066123962, + "num_tokens": 145261618.0, + "step": 5609 + }, + { + "epoch": 0.61607731166264, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.3394360542297363, + "learning_rate": 1e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.6952568292617798, + "num_tokens": 145291071.0, + "step": 5610 + }, + { + "epoch": 0.6161871293652537, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.0946812629699707, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7331146597862244, + "num_tokens": 145323857.0, + "step": 5611 + }, + { + "epoch": 0.6162969470678673, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.301745653152466, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6879527568817139, + "num_tokens": 145353905.0, + "step": 5612 + }, + { + "epoch": 0.616406764770481, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.1224310398101807, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.718298614025116, + "num_tokens": 145384056.0, + "step": 5613 + }, + { + "epoch": 0.6165165824730947, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.3001935482025146, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7254728078842163, + "num_tokens": 145410449.0, + "step": 5614 + }, + { + "epoch": 0.6166264001757084, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.469305992126465, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.723315417766571, + "num_tokens": 145433886.0, + "step": 5615 + }, + { + "epoch": 0.616736217878322, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.7935903072357178, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6857998371124268, + "num_tokens": 145455812.0, + "step": 5616 + }, + { + "epoch": 0.6168460355809356, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.292509078979492, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6867265701293945, + "num_tokens": 
145483198.0, + "step": 5617 + }, + { + "epoch": 0.6169558532835493, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.425408124923706, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7065839767456055, + "num_tokens": 145508062.0, + "step": 5618 + }, + { + "epoch": 0.6170656709861629, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.324728012084961, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6937677264213562, + "num_tokens": 145533803.0, + "step": 5619 + }, + { + "epoch": 0.6171754886887766, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.4973883628845215, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6869866847991943, + "num_tokens": 145561163.0, + "step": 5620 + }, + { + "epoch": 0.6172853063913903, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2829301357269287, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7028909921646118, + "num_tokens": 145587924.0, + "step": 5621 + }, + { + "epoch": 0.617395124094004, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.2235162258148193, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7114728689193726, + "num_tokens": 145614186.0, + "step": 5622 + }, + { + "epoch": 0.6175049417966176, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.422473430633545, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7133548259735107, + "num_tokens": 145636444.0, + "step": 5623 + }, + { + "epoch": 0.6176147594992313, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.1935513019561768, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.7006540298461914, + "num_tokens": 145665371.0, + "step": 5624 + }, + { + "epoch": 0.6177245772018449, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.334014415740967, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7010467052459717, + "num_tokens": 145689331.0, + "step": 5625 + }, + { + 
"epoch": 0.6178343949044586, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.6584584712982178, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6993131637573242, + "num_tokens": 145709752.0, + "step": 5626 + }, + { + "epoch": 0.6179442126070722, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.5228824615478516, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7125887870788574, + "num_tokens": 145732554.0, + "step": 5627 + }, + { + "epoch": 0.618054030309686, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.040602684020996, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6800196170806885, + "num_tokens": 145768213.0, + "step": 5628 + }, + { + "epoch": 0.6181638480122996, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.5975162982940674, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7144697308540344, + "num_tokens": 145788714.0, + "step": 5629 + }, + { + "epoch": 0.6182736657149133, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 2.5009958744049072, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7156847715377808, + "num_tokens": 145811217.0, + "step": 5630 + }, + { + "epoch": 0.6183834834175269, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.419210910797119, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6945265531539917, + "num_tokens": 145834633.0, + "step": 5631 + }, + { + "epoch": 0.6184933011201406, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.6504247188568115, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7023758888244629, + "num_tokens": 145855250.0, + "step": 5632 + }, + { + "epoch": 0.6186031188227542, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.1050820350646973, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6922078728675842, + "num_tokens": 145888420.0, + "step": 5633 + }, + { + "epoch": 0.6187129365253679, + 
"ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.4472293853759766, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7074133157730103, + "num_tokens": 145911814.0, + "step": 5634 + }, + { + "epoch": 0.6188227542279815, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.3305630683898926, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7423722743988037, + "num_tokens": 145937059.0, + "step": 5635 + }, + { + "epoch": 0.6189325719305953, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.3922946453094482, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.7000810503959656, + "num_tokens": 145964732.0, + "step": 5636 + }, + { + "epoch": 0.6190423896332089, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.2736990451812744, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6975815296173096, + "num_tokens": 145990592.0, + "step": 5637 + }, + { + "epoch": 0.6191522073358225, + "ewc_loss": 1.2516975402832031e-05, + "grad_norm": 2.151935577392578, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6910288333892822, + "num_tokens": 146020581.0, + "step": 5638 + }, + { + "epoch": 0.6192620250384362, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.4066028594970703, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6907858848571777, + "num_tokens": 146045852.0, + "step": 5639 + }, + { + "epoch": 0.6193718427410498, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.353565216064453, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7402888536453247, + "num_tokens": 146069988.0, + "step": 5640 + }, + { + "epoch": 0.6194816604436635, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.710768461227417, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7024308443069458, + "num_tokens": 146090334.0, + "step": 5641 + }, + { + "epoch": 0.6195914781462771, + "ewc_loss": 1.2636184692382812e-05, 
+ "grad_norm": 2.445466995239258, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7087289094924927, + "num_tokens": 146113724.0, + "step": 5642 + }, + { + "epoch": 0.6197012958488909, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.593822956085205, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7104958295822144, + "num_tokens": 146138973.0, + "step": 5643 + }, + { + "epoch": 0.6198111135515045, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.109053373336792, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7286299467086792, + "num_tokens": 146168630.0, + "step": 5644 + }, + { + "epoch": 0.6199209312541182, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.205127000808716, + "learning_rate": 1e-06, + "loss": 1.0658, + "mean_token_accuracy": 0.6878018379211426, + "num_tokens": 146198815.0, + "step": 5645 + }, + { + "epoch": 0.6200307489567318, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.6539740562438965, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.724657416343689, + "num_tokens": 146219566.0, + "step": 5646 + }, + { + "epoch": 0.6201405666593455, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.437471866607666, + "learning_rate": 1e-06, + "loss": 1.0977, + "mean_token_accuracy": 0.6804473996162415, + "num_tokens": 146243718.0, + "step": 5647 + }, + { + "epoch": 0.6202503843619591, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.3678836822509766, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6962431073188782, + "num_tokens": 146268636.0, + "step": 5648 + }, + { + "epoch": 0.6203602020645728, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.4876821041107178, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7129945755004883, + "num_tokens": 146291335.0, + "step": 5649 + }, + { + "epoch": 0.6204700197671865, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.59451961517334, + 
"learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7130997180938721, + "num_tokens": 146311255.0, + "step": 5650 + }, + { + "epoch": 0.6205798374698002, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 2.483487367630005, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6958472728729248, + "num_tokens": 146335164.0, + "step": 5651 + }, + { + "epoch": 0.6206896551724138, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.7008113861083984, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7255041599273682, + "num_tokens": 146354287.0, + "step": 5652 + }, + { + "epoch": 0.6207994728750275, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.374394178390503, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7178508639335632, + "num_tokens": 146380335.0, + "step": 5653 + }, + { + "epoch": 0.6209092905776411, + "ewc_loss": 1.2636184692382812e-05, + "grad_norm": 2.5711984634399414, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6978327035903931, + "num_tokens": 146401453.0, + "step": 5654 + }, + { + "epoch": 0.6210191082802548, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.2117414474487305, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6962621808052063, + "num_tokens": 146431847.0, + "step": 5655 + }, + { + "epoch": 0.6211289259828684, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.4414374828338623, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.702350378036499, + "num_tokens": 146456537.0, + "step": 5656 + }, + { + "epoch": 0.6212387436854822, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.4270172119140625, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.7041898965835571, + "num_tokens": 146479809.0, + "step": 5657 + }, + { + "epoch": 0.6213485613880958, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.2846617698669434, + "learning_rate": 1e-06, + "loss": 
1.0447, + "mean_token_accuracy": 0.6932139992713928, + "num_tokens": 146505965.0, + "step": 5658 + }, + { + "epoch": 0.6214583790907094, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3466222286224365, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7055617570877075, + "num_tokens": 146531551.0, + "step": 5659 + }, + { + "epoch": 0.6215681967933231, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.0666284561157227, + "learning_rate": 1e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.6780024766921997, + "num_tokens": 146563551.0, + "step": 5660 + }, + { + "epoch": 0.6216780144959367, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.1938605308532715, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6876816749572754, + "num_tokens": 146591120.0, + "step": 5661 + }, + { + "epoch": 0.6217878321985504, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.653470754623413, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7163840532302856, + "num_tokens": 146611965.0, + "step": 5662 + }, + { + "epoch": 0.621897649901164, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.9613147974014282, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7029116749763489, + "num_tokens": 146648461.0, + "step": 5663 + }, + { + "epoch": 0.6220074676037777, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3336968421936035, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.697481095790863, + "num_tokens": 146676427.0, + "step": 5664 + }, + { + "epoch": 0.6221172853063914, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.320598602294922, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7173234820365906, + "num_tokens": 146700165.0, + "step": 5665 + }, + { + "epoch": 0.6222271030090051, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.372490882873535, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 
0.6985170245170593, + "num_tokens": 146724996.0, + "step": 5666 + }, + { + "epoch": 0.6223369207116187, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.401636838912964, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7098286747932434, + "num_tokens": 146750905.0, + "step": 5667 + }, + { + "epoch": 0.6224467384142324, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3548226356506348, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7025763988494873, + "num_tokens": 146776383.0, + "step": 5668 + }, + { + "epoch": 0.622556556116846, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.376321315765381, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7307069897651672, + "num_tokens": 146800241.0, + "step": 5669 + }, + { + "epoch": 0.6226663738194597, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.1525816917419434, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.70564866065979, + "num_tokens": 146831318.0, + "step": 5670 + }, + { + "epoch": 0.6227761915220733, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.536289691925049, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6984115839004517, + "num_tokens": 146853834.0, + "step": 5671 + }, + { + "epoch": 0.6228860092246871, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.6704330444335938, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7315486073493958, + "num_tokens": 146872499.0, + "step": 5672 + }, + { + "epoch": 0.6229958269273007, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.407442808151245, + "learning_rate": 1e-06, + "loss": 1.1027, + "mean_token_accuracy": 0.6809588074684143, + "num_tokens": 146897374.0, + "step": 5673 + }, + { + "epoch": 0.6231056446299144, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.1205620765686035, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6916612982749939, + "num_tokens": 
146927243.0, + "step": 5674 + }, + { + "epoch": 0.623215462332528, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.405360460281372, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7130150198936462, + "num_tokens": 146949952.0, + "step": 5675 + }, + { + "epoch": 0.6233252800351416, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.2706856727600098, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.686776340007782, + "num_tokens": 146979182.0, + "step": 5676 + }, + { + "epoch": 0.6234350977377553, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.1823654174804688, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.695059061050415, + "num_tokens": 147009131.0, + "step": 5677 + }, + { + "epoch": 0.6235449154403689, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.312429904937744, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6956751346588135, + "num_tokens": 147038561.0, + "step": 5678 + }, + { + "epoch": 0.6236547331429827, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.8254971504211426, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7280204892158508, + "num_tokens": 147055735.0, + "step": 5679 + }, + { + "epoch": 0.6237645508455963, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.5056233406066895, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7127423286437988, + "num_tokens": 147076421.0, + "step": 5680 + }, + { + "epoch": 0.62387436854821, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.4363489151000977, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7057193517684937, + "num_tokens": 147099457.0, + "step": 5681 + }, + { + "epoch": 0.6239841862508236, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.438966751098633, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6912639141082764, + "num_tokens": 147125823.0, + "step": 5682 + }, + { + 
"epoch": 0.6240940039534373, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.6977579593658447, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.7045737504959106, + "num_tokens": 147154551.0, + "step": 5683 + }, + { + "epoch": 0.6242038216560509, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.525634288787842, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7009066939353943, + "num_tokens": 147177596.0, + "step": 5684 + }, + { + "epoch": 0.6243136393586646, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.5113365650177, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6933903098106384, + "num_tokens": 147199678.0, + "step": 5685 + }, + { + "epoch": 0.6244234570612783, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3163444995880127, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7266677021980286, + "num_tokens": 147223851.0, + "step": 5686 + }, + { + "epoch": 0.624533274763892, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.167837142944336, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7173270583152771, + "num_tokens": 147251227.0, + "step": 5687 + }, + { + "epoch": 0.6246430924665056, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3546063899993896, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6841849684715271, + "num_tokens": 147277361.0, + "step": 5688 + }, + { + "epoch": 0.6247529101691193, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.5104684829711914, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.700176477432251, + "num_tokens": 147297440.0, + "step": 5689 + }, + { + "epoch": 0.6248627278717329, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.312584638595581, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7061154246330261, + "num_tokens": 147324500.0, + "step": 5690 + }, + { + "epoch": 0.6249725455743466, + 
"ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.253582000732422, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6916294693946838, + "num_tokens": 147352353.0, + "step": 5691 + }, + { + "epoch": 0.6250823632769602, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.8364510536193848, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.708859384059906, + "num_tokens": 147371328.0, + "step": 5692 + }, + { + "epoch": 0.6251921809795739, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.595639944076538, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7171152234077454, + "num_tokens": 147390884.0, + "step": 5693 + }, + { + "epoch": 0.6253019986821876, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.7048048973083496, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.718326210975647, + "num_tokens": 147410305.0, + "step": 5694 + }, + { + "epoch": 0.6254118163848013, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.437330484390259, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7031587362289429, + "num_tokens": 147434813.0, + "step": 5695 + }, + { + "epoch": 0.6255216340874149, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.164062976837158, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6984010934829712, + "num_tokens": 147466456.0, + "step": 5696 + }, + { + "epoch": 0.6256314517900285, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.791543483734131, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7127134799957275, + "num_tokens": 147485784.0, + "step": 5697 + }, + { + "epoch": 0.6257412694926422, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.297185182571411, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7240957617759705, + "num_tokens": 147509484.0, + "step": 5698 + }, + { + "epoch": 0.6258510871952558, + "ewc_loss": 1.2695789337158203e-05, + 
"grad_norm": 2.5192136764526367, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7030894756317139, + "num_tokens": 147530235.0, + "step": 5699 + }, + { + "epoch": 0.6259609048978695, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3583872318267822, + "learning_rate": 1e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6850895881652832, + "num_tokens": 147556317.0, + "step": 5700 + }, + { + "epoch": 0.6260707226004832, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.370220422744751, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.7070456743240356, + "num_tokens": 147580805.0, + "step": 5701 + }, + { + "epoch": 0.6261805403030969, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3650481700897217, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7135655283927917, + "num_tokens": 147605713.0, + "step": 5702 + }, + { + "epoch": 0.6262903580057105, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.26926326751709, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6920248866081238, + "num_tokens": 147632335.0, + "step": 5703 + }, + { + "epoch": 0.6264001757083242, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.224213123321533, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6964424848556519, + "num_tokens": 147658970.0, + "step": 5704 + }, + { + "epoch": 0.6265099934109378, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.24994158744812, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6997742652893066, + "num_tokens": 147686015.0, + "step": 5705 + }, + { + "epoch": 0.6266198111135515, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.5739755630493164, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7025456428527832, + "num_tokens": 147708507.0, + "step": 5706 + }, + { + "epoch": 0.6267296288161651, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.4052717685699463, + 
"learning_rate": 1e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6808938980102539, + "num_tokens": 147736217.0, + "step": 5707 + }, + { + "epoch": 0.6268394465187789, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.4259631633758545, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.7017600536346436, + "num_tokens": 147760731.0, + "step": 5708 + }, + { + "epoch": 0.6269492642213925, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.4139175415039062, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6872681379318237, + "num_tokens": 147786980.0, + "step": 5709 + }, + { + "epoch": 0.6270590819240062, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.465269088745117, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7191578149795532, + "num_tokens": 147807755.0, + "step": 5710 + }, + { + "epoch": 0.6271688996266198, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.366908311843872, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6948925852775574, + "num_tokens": 147834274.0, + "step": 5711 + }, + { + "epoch": 0.6272787173292335, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.2927937507629395, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7166785001754761, + "num_tokens": 147860854.0, + "step": 5712 + }, + { + "epoch": 0.6273885350318471, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.395498514175415, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6802921295166016, + "num_tokens": 147887369.0, + "step": 5713 + }, + { + "epoch": 0.6274983527344608, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.2030956745147705, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7209821939468384, + "num_tokens": 147915582.0, + "step": 5714 + }, + { + "epoch": 0.6276081704370745, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3144171237945557, + "learning_rate": 1e-06, + "loss": 
1.075, + "mean_token_accuracy": 0.6907625198364258, + "num_tokens": 147943778.0, + "step": 5715 + }, + { + "epoch": 0.6277179881396882, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.4060442447662354, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6916910409927368, + "num_tokens": 147967687.0, + "step": 5716 + }, + { + "epoch": 0.6278278058423018, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.199315309524536, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6965463161468506, + "num_tokens": 147996893.0, + "step": 5717 + }, + { + "epoch": 0.6279376235449154, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.1066670417785645, + "learning_rate": 1e-06, + "loss": 1.0992, + "mean_token_accuracy": 0.6788837909698486, + "num_tokens": 148030636.0, + "step": 5718 + }, + { + "epoch": 0.6280474412475291, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3605711460113525, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7106538414955139, + "num_tokens": 148054012.0, + "step": 5719 + }, + { + "epoch": 0.6281572589501427, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1099205017089844, + "learning_rate": 1e-06, + "loss": 1.1103, + "mean_token_accuracy": 0.676671028137207, + "num_tokens": 148085309.0, + "step": 5720 + }, + { + "epoch": 0.6282670766527564, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.455453395843506, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.69158935546875, + "num_tokens": 148109119.0, + "step": 5721 + }, + { + "epoch": 0.62837689435537, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.617398977279663, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6956298351287842, + "num_tokens": 148132385.0, + "step": 5722 + }, + { + "epoch": 0.6284867120579838, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.040515661239624, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 
0.6990566253662109, + "num_tokens": 148165336.0, + "step": 5723 + }, + { + "epoch": 0.6285965297605974, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.143911361694336, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6970700025558472, + "num_tokens": 148195345.0, + "step": 5724 + }, + { + "epoch": 0.6287063474632111, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.465090751647949, + "learning_rate": 1e-06, + "loss": 1.1015, + "mean_token_accuracy": 0.6835060715675354, + "num_tokens": 148218406.0, + "step": 5725 + }, + { + "epoch": 0.6288161651658247, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.101888656616211, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7049059867858887, + "num_tokens": 148249170.0, + "step": 5726 + }, + { + "epoch": 0.6289259828684384, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.6183276176452637, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7180712223052979, + "num_tokens": 148269680.0, + "step": 5727 + }, + { + "epoch": 0.629035800571052, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.501742362976074, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7229313850402832, + "num_tokens": 148291251.0, + "step": 5728 + }, + { + "epoch": 0.6291456182736657, + "ewc_loss": 1.2814998626708984e-05, + "grad_norm": 2.4050633907318115, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.709667444229126, + "num_tokens": 148318381.0, + "step": 5729 + }, + { + "epoch": 0.6292554359762794, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.219717502593994, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7102982401847839, + "num_tokens": 148345808.0, + "step": 5730 + }, + { + "epoch": 0.6293652536788931, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.510510206222534, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7214362621307373, + "num_tokens": 
148367241.0, + "step": 5731 + }, + { + "epoch": 0.6294750713815067, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3140485286712646, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6821728348731995, + "num_tokens": 148394134.0, + "step": 5732 + }, + { + "epoch": 0.6295848890841204, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3510167598724365, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7104052305221558, + "num_tokens": 148418898.0, + "step": 5733 + }, + { + "epoch": 0.629694706786734, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3542518615722656, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6993964910507202, + "num_tokens": 148445447.0, + "step": 5734 + }, + { + "epoch": 0.6298045244893477, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.429112672805786, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6976354122161865, + "num_tokens": 148468281.0, + "step": 5735 + }, + { + "epoch": 0.6299143421919613, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.270233392715454, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7047867774963379, + "num_tokens": 148494389.0, + "step": 5736 + }, + { + "epoch": 0.630024159894575, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.331482172012329, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7039236426353455, + "num_tokens": 148521662.0, + "step": 5737 + }, + { + "epoch": 0.6301339775971887, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5860393047332764, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7145877480506897, + "num_tokens": 148541457.0, + "step": 5738 + }, + { + "epoch": 0.6302437952998023, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2003140449523926, + "learning_rate": 1e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6860569715499878, + "num_tokens": 148569634.0, + "step": 5739 + }, + { 
+ "epoch": 0.630353613002416, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.101942539215088, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6966499090194702, + "num_tokens": 148600795.0, + "step": 5740 + }, + { + "epoch": 0.6304634307050296, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2020058631896973, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7109748125076294, + "num_tokens": 148629434.0, + "step": 5741 + }, + { + "epoch": 0.6305732484076433, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1818439960479736, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.702601432800293, + "num_tokens": 148658548.0, + "step": 5742 + }, + { + "epoch": 0.6306830661102569, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3149006366729736, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7116470336914062, + "num_tokens": 148685449.0, + "step": 5743 + }, + { + "epoch": 0.6307928838128707, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.317748546600342, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6951390504837036, + "num_tokens": 148709970.0, + "step": 5744 + }, + { + "epoch": 0.6309027015154843, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2568368911743164, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7020363211631775, + "num_tokens": 148737062.0, + "step": 5745 + }, + { + "epoch": 0.631012519218098, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1460678577423096, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.703791618347168, + "num_tokens": 148767671.0, + "step": 5746 + }, + { + "epoch": 0.6311223369207116, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2383217811584473, + "learning_rate": 1e-06, + "loss": 1.1075, + "mean_token_accuracy": 0.6726141571998596, + "num_tokens": 148798121.0, + "step": 5747 + }, + { + "epoch": 0.6312321546233253, + 
"ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.334935426712036, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7416788339614868, + "num_tokens": 148822303.0, + "step": 5748 + }, + { + "epoch": 0.6313419723259389, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.450904607772827, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7054775953292847, + "num_tokens": 148846091.0, + "step": 5749 + }, + { + "epoch": 0.6314517900285526, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1805953979492188, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.7028273940086365, + "num_tokens": 148875284.0, + "step": 5750 + }, + { + "epoch": 0.6315616077311663, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.378462791442871, + "learning_rate": 1e-06, + "loss": 1.1061, + "mean_token_accuracy": 0.6926872134208679, + "num_tokens": 148903453.0, + "step": 5751 + }, + { + "epoch": 0.63167142543378, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2733302116394043, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7002885341644287, + "num_tokens": 148930804.0, + "step": 5752 + }, + { + "epoch": 0.6317812431363936, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.072190284729004, + "learning_rate": 1e-06, + "loss": 1.0984, + "mean_token_accuracy": 0.6789678931236267, + "num_tokens": 148961324.0, + "step": 5753 + }, + { + "epoch": 0.6318910608390073, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.2030646800994873, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7058888673782349, + "num_tokens": 148989833.0, + "step": 5754 + }, + { + "epoch": 0.6320008785416209, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.1372292041778564, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6972360610961914, + "num_tokens": 149021071.0, + "step": 5755 + }, + { + "epoch": 0.6321106962442345, + "ewc_loss": 1.2755393981933594e-05, + 
"grad_norm": 2.2943999767303467, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7149049043655396, + "num_tokens": 149044826.0, + "step": 5756 + }, + { + "epoch": 0.6322205139468482, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.201929807662964, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6968778371810913, + "num_tokens": 149071338.0, + "step": 5757 + }, + { + "epoch": 0.6323303316494618, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.234680414199829, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.7061378955841064, + "num_tokens": 149100176.0, + "step": 5758 + }, + { + "epoch": 0.6324401493520756, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3668737411499023, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.689801812171936, + "num_tokens": 149126871.0, + "step": 5759 + }, + { + "epoch": 0.6325499670546892, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.585364818572998, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7005075216293335, + "num_tokens": 149149003.0, + "step": 5760 + }, + { + "epoch": 0.6326597847573029, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5651631355285645, + "learning_rate": 1e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6788785457611084, + "num_tokens": 149172255.0, + "step": 5761 + }, + { + "epoch": 0.6327696024599165, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.611966133117676, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.691382884979248, + "num_tokens": 149194487.0, + "step": 5762 + }, + { + "epoch": 0.6328794201625302, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3408336639404297, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6990715265274048, + "num_tokens": 149219638.0, + "step": 5763 + }, + { + "epoch": 0.6329892378651438, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.8597412109375, + 
"learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6949851512908936, + "num_tokens": 149238091.0, + "step": 5764 + }, + { + "epoch": 0.6330990555677575, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.414705514907837, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7039807438850403, + "num_tokens": 149261916.0, + "step": 5765 + }, + { + "epoch": 0.6332088732703712, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4708938598632812, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7239803075790405, + "num_tokens": 149283449.0, + "step": 5766 + }, + { + "epoch": 0.6333186909729849, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.281327247619629, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.70805424451828, + "num_tokens": 149309221.0, + "step": 5767 + }, + { + "epoch": 0.6334285086755985, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1422181129455566, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6879003643989563, + "num_tokens": 149338800.0, + "step": 5768 + }, + { + "epoch": 0.6335383263782122, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3693044185638428, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7018794417381287, + "num_tokens": 149367427.0, + "step": 5769 + }, + { + "epoch": 0.6336481440808258, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.415168523788452, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.704287052154541, + "num_tokens": 149391851.0, + "step": 5770 + }, + { + "epoch": 0.6337579617834395, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.423356056213379, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7124003171920776, + "num_tokens": 149415251.0, + "step": 5771 + }, + { + "epoch": 0.6338677794860531, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2087409496307373, + "learning_rate": 1e-06, + "loss": 
1.0796, + "mean_token_accuracy": 0.6816548109054565, + "num_tokens": 149444650.0, + "step": 5772 + }, + { + "epoch": 0.6339775971886669, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.071936845779419, + "learning_rate": 1e-06, + "loss": 1.1155, + "mean_token_accuracy": 0.6713598370552063, + "num_tokens": 149475849.0, + "step": 5773 + }, + { + "epoch": 0.6340874148912805, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.228166103363037, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6900745630264282, + "num_tokens": 149503813.0, + "step": 5774 + }, + { + "epoch": 0.6341972325938942, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.446564197540283, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6967934370040894, + "num_tokens": 149527726.0, + "step": 5775 + }, + { + "epoch": 0.6343070502965078, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.104675531387329, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6901528835296631, + "num_tokens": 149560705.0, + "step": 5776 + }, + { + "epoch": 0.6344168679991214, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.195901870727539, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7016540765762329, + "num_tokens": 149587557.0, + "step": 5777 + }, + { + "epoch": 0.6345266857017351, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.3103954792022705, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6934940814971924, + "num_tokens": 149613308.0, + "step": 5778 + }, + { + "epoch": 0.6346365034043487, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.4093902111053467, + "learning_rate": 1e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.6734725832939148, + "num_tokens": 149639459.0, + "step": 5779 + }, + { + "epoch": 0.6347463211069625, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.172990560531616, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 
0.6995403170585632, + "num_tokens": 149669146.0, + "step": 5780 + }, + { + "epoch": 0.6348561388095761, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.273583173751831, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7071130275726318, + "num_tokens": 149695518.0, + "step": 5781 + }, + { + "epoch": 0.6349659565121898, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.605591297149658, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7137664556503296, + "num_tokens": 149718383.0, + "step": 5782 + }, + { + "epoch": 0.6350757742148034, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.5415494441986084, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7034429311752319, + "num_tokens": 149741179.0, + "step": 5783 + }, + { + "epoch": 0.6351855919174171, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.6007230281829834, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.69527268409729, + "num_tokens": 149763981.0, + "step": 5784 + }, + { + "epoch": 0.6352954096200307, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.135018825531006, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6945697069168091, + "num_tokens": 149794146.0, + "step": 5785 + }, + { + "epoch": 0.6354052273226444, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.32930326461792, + "learning_rate": 1e-06, + "loss": 1.0912, + "mean_token_accuracy": 0.6913089752197266, + "num_tokens": 149819811.0, + "step": 5786 + }, + { + "epoch": 0.635515045025258, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2002601623535156, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6937777996063232, + "num_tokens": 149848238.0, + "step": 5787 + }, + { + "epoch": 0.6356248627278718, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.304159641265869, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7044228315353394, + "num_tokens": 
149874801.0, + "step": 5788 + }, + { + "epoch": 0.6357346804304854, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.508502960205078, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7083919048309326, + "num_tokens": 149897048.0, + "step": 5789 + }, + { + "epoch": 0.6358444981330991, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.463728427886963, + "learning_rate": 1e-06, + "loss": 1.0803, + "mean_token_accuracy": 0.6886307597160339, + "num_tokens": 149919727.0, + "step": 5790 + }, + { + "epoch": 0.6359543158357127, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3199446201324463, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.701064944267273, + "num_tokens": 149945061.0, + "step": 5791 + }, + { + "epoch": 0.6360641335383264, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.570323944091797, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7160602807998657, + "num_tokens": 149966169.0, + "step": 5792 + }, + { + "epoch": 0.63617395124094, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.265357255935669, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7205208539962769, + "num_tokens": 149992607.0, + "step": 5793 + }, + { + "epoch": 0.6362837689435537, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.406130790710449, + "learning_rate": 1e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6817533373832703, + "num_tokens": 150019636.0, + "step": 5794 + }, + { + "epoch": 0.6363935866461674, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2729344367980957, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6968400478363037, + "num_tokens": 150046268.0, + "step": 5795 + }, + { + "epoch": 0.636503404348781, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.352071762084961, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6876224279403687, + "num_tokens": 150072283.0, + "step": 5796 + }, + { + 
"epoch": 0.6366132220513947, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2238657474517822, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.7012048363685608, + "num_tokens": 150099010.0, + "step": 5797 + }, + { + "epoch": 0.6367230397540083, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3251657485961914, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7064651846885681, + "num_tokens": 150124807.0, + "step": 5798 + }, + { + "epoch": 0.636832857456622, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.361259698867798, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7065649032592773, + "num_tokens": 150153282.0, + "step": 5799 + }, + { + "epoch": 0.6369426751592356, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5418009757995605, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6915632486343384, + "num_tokens": 150175704.0, + "step": 5800 + }, + { + "epoch": 0.6370524928618493, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3023061752319336, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7127927541732788, + "num_tokens": 150201266.0, + "step": 5801 + }, + { + "epoch": 0.637162310564463, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4308509826660156, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7113417387008667, + "num_tokens": 150223486.0, + "step": 5802 + }, + { + "epoch": 0.6372721282670767, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.299152374267578, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7003783583641052, + "num_tokens": 150249127.0, + "step": 5803 + }, + { + "epoch": 0.6373819459696903, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.369323253631592, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7199216485023499, + "num_tokens": 150272652.0, + "step": 5804 + }, + { + "epoch": 0.637491763672304, + 
"ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4406626224517822, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7155172824859619, + "num_tokens": 150296163.0, + "step": 5805 + }, + { + "epoch": 0.6376015813749176, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2368855476379395, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6891193389892578, + "num_tokens": 150326248.0, + "step": 5806 + }, + { + "epoch": 0.6377113990775313, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2686781883239746, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7120668888092041, + "num_tokens": 150350743.0, + "step": 5807 + }, + { + "epoch": 0.6378212167801449, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3631792068481445, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7220079898834229, + "num_tokens": 150374351.0, + "step": 5808 + }, + { + "epoch": 0.6379310344827587, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.164585590362549, + "learning_rate": 1e-06, + "loss": 1.0816, + "mean_token_accuracy": 0.6806213855743408, + "num_tokens": 150403427.0, + "step": 5809 + }, + { + "epoch": 0.6380408521853723, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4378316402435303, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7168400883674622, + "num_tokens": 150425915.0, + "step": 5810 + }, + { + "epoch": 0.638150669887986, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4496445655822754, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7118694186210632, + "num_tokens": 150448372.0, + "step": 5811 + }, + { + "epoch": 0.6382604875905996, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4005110263824463, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7187583446502686, + "num_tokens": 150470554.0, + "step": 5812 + }, + { + "epoch": 0.6383703052932133, + "ewc_loss": 
1.2755393981933594e-05, + "grad_norm": 2.5249054431915283, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6821917295455933, + "num_tokens": 150493751.0, + "step": 5813 + }, + { + "epoch": 0.6384801229958269, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4458394050598145, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6913846731185913, + "num_tokens": 150518023.0, + "step": 5814 + }, + { + "epoch": 0.6385899406984406, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.0623068809509277, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7203699350357056, + "num_tokens": 150549241.0, + "step": 5815 + }, + { + "epoch": 0.6386997584010542, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1591451168060303, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.7016042470932007, + "num_tokens": 150580556.0, + "step": 5816 + }, + { + "epoch": 0.638809576103668, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.182135581970215, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6876639127731323, + "num_tokens": 150611486.0, + "step": 5817 + }, + { + "epoch": 0.6389193938062816, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.395084857940674, + "learning_rate": 1e-06, + "loss": 1.1005, + "mean_token_accuracy": 0.6737158298492432, + "num_tokens": 150636022.0, + "step": 5818 + }, + { + "epoch": 0.6390292115088952, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4997525215148926, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6953722238540649, + "num_tokens": 150659070.0, + "step": 5819 + }, + { + "epoch": 0.6391390292115089, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.293869972229004, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7292571067810059, + "num_tokens": 150685946.0, + "step": 5820 + }, + { + "epoch": 0.6392488469141225, + "ewc_loss": 1.2755393981933594e-05, + 
"grad_norm": 2.4789891242980957, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7105189561843872, + "num_tokens": 150710552.0, + "step": 5821 + }, + { + "epoch": 0.6393586646167362, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3007972240448, + "learning_rate": 1e-06, + "loss": 1.1098, + "mean_token_accuracy": 0.6846964955329895, + "num_tokens": 150739809.0, + "step": 5822 + }, + { + "epoch": 0.6394684823193498, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.6207940578460693, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7053579688072205, + "num_tokens": 150761127.0, + "step": 5823 + }, + { + "epoch": 0.6395783000219636, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3506152629852295, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.756203293800354, + "num_tokens": 150786097.0, + "step": 5824 + }, + { + "epoch": 0.6396881177245772, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.596959352493286, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6905484199523926, + "num_tokens": 150807894.0, + "step": 5825 + }, + { + "epoch": 0.6397979354271909, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4355549812316895, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7064800262451172, + "num_tokens": 150831323.0, + "step": 5826 + }, + { + "epoch": 0.6399077531298045, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.425511360168457, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6950932145118713, + "num_tokens": 150858194.0, + "step": 5827 + }, + { + "epoch": 0.6400175708324182, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3542113304138184, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7061384320259094, + "num_tokens": 150882564.0, + "step": 5828 + }, + { + "epoch": 0.6401273885350318, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.388009548187256, + 
"learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7208660244941711, + "num_tokens": 150907187.0, + "step": 5829 + }, + { + "epoch": 0.6402372062376455, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3248703479766846, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6833325028419495, + "num_tokens": 150934895.0, + "step": 5830 + }, + { + "epoch": 0.6403470239402592, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.428617000579834, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6938189268112183, + "num_tokens": 150958210.0, + "step": 5831 + }, + { + "epoch": 0.6404568416428729, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4066762924194336, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7324615120887756, + "num_tokens": 150980567.0, + "step": 5832 + }, + { + "epoch": 0.6405666593454865, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1798675060272217, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6910057067871094, + "num_tokens": 151009372.0, + "step": 5833 + }, + { + "epoch": 0.6406764770481002, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.438314914703369, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.694153368473053, + "num_tokens": 151035712.0, + "step": 5834 + }, + { + "epoch": 0.6407862947507138, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2848117351531982, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.722234845161438, + "num_tokens": 151061318.0, + "step": 5835 + }, + { + "epoch": 0.6408961124533274, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3901236057281494, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7015150785446167, + "num_tokens": 151087793.0, + "step": 5836 + }, + { + "epoch": 0.6410059301559411, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4583892822265625, + "learning_rate": 1e-06, + "loss": 
1.0802, + "mean_token_accuracy": 0.6932688355445862, + "num_tokens": 151111689.0, + "step": 5837 + }, + { + "epoch": 0.6411157478585549, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.110417366027832, + "learning_rate": 1e-06, + "loss": 1.0894, + "mean_token_accuracy": 0.6842536926269531, + "num_tokens": 151144752.0, + "step": 5838 + }, + { + "epoch": 0.6412255655611685, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.15309476852417, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.707841694355011, + "num_tokens": 151175374.0, + "step": 5839 + }, + { + "epoch": 0.6413353832637821, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.0516579151153564, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6940584182739258, + "num_tokens": 151207581.0, + "step": 5840 + }, + { + "epoch": 0.6414452009663958, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3510327339172363, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7205945253372192, + "num_tokens": 151231272.0, + "step": 5841 + }, + { + "epoch": 0.6415550186690094, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2174158096313477, + "learning_rate": 1e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6949142217636108, + "num_tokens": 151259848.0, + "step": 5842 + }, + { + "epoch": 0.6416648363716231, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.446424722671509, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7152546048164368, + "num_tokens": 151283535.0, + "step": 5843 + }, + { + "epoch": 0.6417746540742367, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4932713508605957, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7149165272712708, + "num_tokens": 151307785.0, + "step": 5844 + }, + { + "epoch": 0.6418844717768504, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.341935157775879, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 
0.6877610087394714, + "num_tokens": 151334314.0, + "step": 5845 + }, + { + "epoch": 0.6419942894794641, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.656829357147217, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7155717015266418, + "num_tokens": 151355616.0, + "step": 5846 + }, + { + "epoch": 0.6421041071820778, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.476897954940796, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7273343801498413, + "num_tokens": 151377054.0, + "step": 5847 + }, + { + "epoch": 0.6422139248846914, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3050272464752197, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7084787487983704, + "num_tokens": 151402658.0, + "step": 5848 + }, + { + "epoch": 0.6423237425873051, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3284032344818115, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7032995820045471, + "num_tokens": 151429171.0, + "step": 5849 + }, + { + "epoch": 0.6424335602899187, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2405083179473877, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7004138231277466, + "num_tokens": 151458553.0, + "step": 5850 + }, + { + "epoch": 0.6425433779925324, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.490464925765991, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7132753133773804, + "num_tokens": 151481152.0, + "step": 5851 + }, + { + "epoch": 0.642653195695146, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.189272165298462, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7051867842674255, + "num_tokens": 151511318.0, + "step": 5852 + }, + { + "epoch": 0.6427630133977598, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4948291778564453, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7092092037200928, + "num_tokens": 
151534627.0, + "step": 5853 + }, + { + "epoch": 0.6428728311003734, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3511664867401123, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6971914768218994, + "num_tokens": 151560458.0, + "step": 5854 + }, + { + "epoch": 0.6429826488029871, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5196492671966553, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6999448537826538, + "num_tokens": 151582610.0, + "step": 5855 + }, + { + "epoch": 0.6430924665056007, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4104111194610596, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7177799940109253, + "num_tokens": 151608983.0, + "step": 5856 + }, + { + "epoch": 0.6432022842082143, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.7293524742126465, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7222322821617126, + "num_tokens": 151627493.0, + "step": 5857 + }, + { + "epoch": 0.643312101910828, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.9993858337402344, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7390148043632507, + "num_tokens": 151661357.0, + "step": 5858 + }, + { + "epoch": 0.6434219196134416, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.086888551712036, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6941530704498291, + "num_tokens": 151696267.0, + "step": 5859 + }, + { + "epoch": 0.6435317373160554, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1841959953308105, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.700194239616394, + "num_tokens": 151724986.0, + "step": 5860 + }, + { + "epoch": 0.643641555018669, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2218356132507324, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7182037830352783, + "num_tokens": 151752918.0, + "step": 5861 + }, + 
{ + "epoch": 0.6437513727212827, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.339709997177124, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7208380699157715, + "num_tokens": 151777243.0, + "step": 5862 + }, + { + "epoch": 0.6438611904238963, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3528354167938232, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6968489289283752, + "num_tokens": 151805063.0, + "step": 5863 + }, + { + "epoch": 0.64397100812651, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3833484649658203, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7092175483703613, + "num_tokens": 151831224.0, + "step": 5864 + }, + { + "epoch": 0.6440808258291236, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.078920841217041, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6768360733985901, + "num_tokens": 151862421.0, + "step": 5865 + }, + { + "epoch": 0.6441906435317373, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.439117193222046, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.694706380367279, + "num_tokens": 151888055.0, + "step": 5866 + }, + { + "epoch": 0.644300461234351, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5618207454681396, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7254195213317871, + "num_tokens": 151907627.0, + "step": 5867 + }, + { + "epoch": 0.6444102789369647, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4682297706604004, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7028964161872864, + "num_tokens": 151931472.0, + "step": 5868 + }, + { + "epoch": 0.6445200966395783, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3825459480285645, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7140454053878784, + "num_tokens": 151958960.0, + "step": 5869 + }, + { + "epoch": 0.644629914342192, + 
"ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1953063011169434, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.7075363993644714, + "num_tokens": 151986428.0, + "step": 5870 + }, + { + "epoch": 0.6447397320448056, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5336389541625977, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7042713761329651, + "num_tokens": 152009759.0, + "step": 5871 + }, + { + "epoch": 0.6448495497474193, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 6.983945369720459, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6842461228370667, + "num_tokens": 152039419.0, + "step": 5872 + }, + { + "epoch": 0.6449593674500329, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4289374351501465, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7094532251358032, + "num_tokens": 152061413.0, + "step": 5873 + }, + { + "epoch": 0.6450691851526466, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.449843645095825, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6851296424865723, + "num_tokens": 152086909.0, + "step": 5874 + }, + { + "epoch": 0.6451790028552603, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.552727222442627, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.709152102470398, + "num_tokens": 152110852.0, + "step": 5875 + }, + { + "epoch": 0.645288820557874, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4687068462371826, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6880874633789062, + "num_tokens": 152139170.0, + "step": 5876 + }, + { + "epoch": 0.6453986382604876, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5681440830230713, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6930221319198608, + "num_tokens": 152162709.0, + "step": 5877 + }, + { + "epoch": 0.6455084559631012, + "ewc_loss": 1.2755393981933594e-05, 
+ "grad_norm": 2.327465295791626, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.692308247089386, + "num_tokens": 152190478.0, + "step": 5878 + }, + { + "epoch": 0.6456182736657149, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.245497226715088, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6997066736221313, + "num_tokens": 152218556.0, + "step": 5879 + }, + { + "epoch": 0.6457280913683285, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4468438625335693, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7084672451019287, + "num_tokens": 152242928.0, + "step": 5880 + }, + { + "epoch": 0.6458379090709422, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4352784156799316, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.708146870136261, + "num_tokens": 152266912.0, + "step": 5881 + }, + { + "epoch": 0.6459477267735559, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.18554425239563, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6948860287666321, + "num_tokens": 152297859.0, + "step": 5882 + }, + { + "epoch": 0.6460575444761696, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.6336753368377686, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7050557136535645, + "num_tokens": 152320490.0, + "step": 5883 + }, + { + "epoch": 0.6461673621787832, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.330348253250122, + "learning_rate": 1e-06, + "loss": 1.0726, + "mean_token_accuracy": 0.6884286999702454, + "num_tokens": 152346999.0, + "step": 5884 + }, + { + "epoch": 0.6462771798813969, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2525720596313477, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7036643028259277, + "num_tokens": 152377770.0, + "step": 5885 + }, + { + "epoch": 0.6463869975840105, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2290501594543457, + 
"learning_rate": 1e-06, + "loss": 1.0817, + "mean_token_accuracy": 0.6811294555664062, + "num_tokens": 152408079.0, + "step": 5886 + }, + { + "epoch": 0.6464968152866242, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.6072254180908203, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6938439011573792, + "num_tokens": 152430651.0, + "step": 5887 + }, + { + "epoch": 0.6466066329892378, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.364454984664917, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6924622058868408, + "num_tokens": 152459557.0, + "step": 5888 + }, + { + "epoch": 0.6467164506918516, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 3.7272820472717285, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7291351556777954, + "num_tokens": 152478085.0, + "step": 5889 + }, + { + "epoch": 0.6468262683944652, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4045562744140625, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7227874994277954, + "num_tokens": 152506522.0, + "step": 5890 + }, + { + "epoch": 0.6469360860970789, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3113644123077393, + "learning_rate": 1e-06, + "loss": 1.1023, + "mean_token_accuracy": 0.6784564256668091, + "num_tokens": 152535998.0, + "step": 5891 + }, + { + "epoch": 0.6470459037996925, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3100693225860596, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6908536553382874, + "num_tokens": 152562365.0, + "step": 5892 + }, + { + "epoch": 0.6471557215023062, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1189825534820557, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6841669082641602, + "num_tokens": 152591858.0, + "step": 5893 + }, + { + "epoch": 0.6472655392049198, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.397165060043335, + "learning_rate": 1e-06, + "loss": 
0.9502, + "mean_token_accuracy": 0.726346492767334, + "num_tokens": 152616172.0, + "step": 5894 + }, + { + "epoch": 0.6473753569075335, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 7.059435844421387, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6961570978164673, + "num_tokens": 152639601.0, + "step": 5895 + }, + { + "epoch": 0.6474851746101472, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.355436086654663, + "learning_rate": 1e-06, + "loss": 1.0984, + "mean_token_accuracy": 0.6745424866676331, + "num_tokens": 152667239.0, + "step": 5896 + }, + { + "epoch": 0.6475949923127609, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.160385847091675, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6841615438461304, + "num_tokens": 152697231.0, + "step": 5897 + }, + { + "epoch": 0.6477048100153745, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.186131000518799, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6929875612258911, + "num_tokens": 152728080.0, + "step": 5898 + }, + { + "epoch": 0.6478146277179881, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.9749780893325806, + "learning_rate": 1e-06, + "loss": 1.0707, + "mean_token_accuracy": 0.6830179691314697, + "num_tokens": 152762345.0, + "step": 5899 + }, + { + "epoch": 0.6479244454206018, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2148380279541016, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6929367780685425, + "num_tokens": 152789650.0, + "step": 5900 + }, + { + "epoch": 0.6480342631232154, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.144000291824341, + "learning_rate": 1e-06, + "loss": 1.1242, + "mean_token_accuracy": 0.6763910055160522, + "num_tokens": 152821126.0, + "step": 5901 + }, + { + "epoch": 0.6481440808258291, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4539706707000732, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 
0.6971443891525269, + "num_tokens": 152846609.0, + "step": 5902 + }, + { + "epoch": 0.6482538985284428, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2607908248901367, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6973723769187927, + "num_tokens": 152872815.0, + "step": 5903 + }, + { + "epoch": 0.6483637162310565, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.274907112121582, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6888636946678162, + "num_tokens": 152900870.0, + "step": 5904 + }, + { + "epoch": 0.6484735339336701, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.308616876602173, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7182794809341431, + "num_tokens": 152926475.0, + "step": 5905 + }, + { + "epoch": 0.6485833516362838, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.0709164142608643, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6901500225067139, + "num_tokens": 152957867.0, + "step": 5906 + }, + { + "epoch": 0.6486931693388974, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.365483045578003, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.712494969367981, + "num_tokens": 152982578.0, + "step": 5907 + }, + { + "epoch": 0.6488029870415111, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4225072860717773, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7147863507270813, + "num_tokens": 153006120.0, + "step": 5908 + }, + { + "epoch": 0.6489128047441247, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.267343044281006, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7268478870391846, + "num_tokens": 153032777.0, + "step": 5909 + }, + { + "epoch": 0.6490226224467384, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.342031955718994, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6859420537948608, + "num_tokens": 
153061135.0, + "step": 5910 + }, + { + "epoch": 0.6491324401493521, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1867949962615967, + "learning_rate": 1e-06, + "loss": 1.135, + "mean_token_accuracy": 0.659052848815918, + "num_tokens": 153094626.0, + "step": 5911 + }, + { + "epoch": 0.6492422578519658, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5080068111419678, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6996078491210938, + "num_tokens": 153120192.0, + "step": 5912 + }, + { + "epoch": 0.6493520755545794, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2679803371429443, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7051562070846558, + "num_tokens": 153146784.0, + "step": 5913 + }, + { + "epoch": 0.6494618932571931, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4523677825927734, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.718855082988739, + "num_tokens": 153170056.0, + "step": 5914 + }, + { + "epoch": 0.6495717109598067, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.289156198501587, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7197647094726562, + "num_tokens": 153195030.0, + "step": 5915 + }, + { + "epoch": 0.6496815286624203, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3092527389526367, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7125415802001953, + "num_tokens": 153221898.0, + "step": 5916 + }, + { + "epoch": 0.649791346365034, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.0696523189544678, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6991336345672607, + "num_tokens": 153255269.0, + "step": 5917 + }, + { + "epoch": 0.6499011640676478, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3402907848358154, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7177417278289795, + "num_tokens": 153281596.0, + "step": 5918 + }, + { + 
"epoch": 0.6500109817702614, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.726806879043579, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7163773775100708, + "num_tokens": 153299996.0, + "step": 5919 + }, + { + "epoch": 0.650120799472875, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3381643295288086, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.702316164970398, + "num_tokens": 153324129.0, + "step": 5920 + }, + { + "epoch": 0.6502306171754887, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4704062938690186, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6982479095458984, + "num_tokens": 153348145.0, + "step": 5921 + }, + { + "epoch": 0.6503404348781023, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.4864349365234375, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7111261487007141, + "num_tokens": 153371737.0, + "step": 5922 + }, + { + "epoch": 0.650450252580716, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1877787113189697, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7000397443771362, + "num_tokens": 153402899.0, + "step": 5923 + }, + { + "epoch": 0.6505600702833296, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2911739349365234, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7284920811653137, + "num_tokens": 153428385.0, + "step": 5924 + }, + { + "epoch": 0.6506698879859434, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.114711284637451, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6847672462463379, + "num_tokens": 153461679.0, + "step": 5925 + }, + { + "epoch": 0.650779705688557, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.259377956390381, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6968065500259399, + "num_tokens": 153491875.0, + "step": 5926 + }, + { + "epoch": 0.6508895233911707, + 
"ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.571436882019043, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6971539258956909, + "num_tokens": 153512525.0, + "step": 5927 + }, + { + "epoch": 0.6509993410937843, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3711447715759277, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7043206095695496, + "num_tokens": 153537574.0, + "step": 5928 + }, + { + "epoch": 0.651109158796398, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.3842413425445557, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.71234530210495, + "num_tokens": 153561400.0, + "step": 5929 + }, + { + "epoch": 0.6512189764990116, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.346599817276001, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6921226978302002, + "num_tokens": 153588151.0, + "step": 5930 + }, + { + "epoch": 0.6513287942016253, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.2431447505950928, + "learning_rate": 1e-06, + "loss": 1.1467, + "mean_token_accuracy": 0.6671895980834961, + "num_tokens": 153619776.0, + "step": 5931 + }, + { + "epoch": 0.651438611904239, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.722695827484131, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7022813558578491, + "num_tokens": 153639527.0, + "step": 5932 + }, + { + "epoch": 0.6515484296068527, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.1658313274383545, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7219855785369873, + "num_tokens": 153667505.0, + "step": 5933 + }, + { + "epoch": 0.6516582473094663, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.246992588043213, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6948848962783813, + "num_tokens": 153695390.0, + "step": 5934 + }, + { + "epoch": 0.65176806501208, + "ewc_loss": 1.2755393981933594e-05, + 
"grad_norm": 2.5959115028381348, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7114612460136414, + "num_tokens": 153717638.0, + "step": 5935 + }, + { + "epoch": 0.6518778827146936, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.5666866302490234, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7109982371330261, + "num_tokens": 153738948.0, + "step": 5936 + }, + { + "epoch": 0.6519877004173072, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.824957847595215, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.726922333240509, + "num_tokens": 153755757.0, + "step": 5937 + }, + { + "epoch": 0.6520975181199209, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 2.318753242492676, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6829149127006531, + "num_tokens": 153782883.0, + "step": 5938 + }, + { + "epoch": 0.6522073358225345, + "ewc_loss": 1.2814998626708984e-05, + "grad_norm": 2.3896594047546387, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7168304920196533, + "num_tokens": 153809530.0, + "step": 5939 + }, + { + "epoch": 0.6523171535251483, + "ewc_loss": 1.2814998626708984e-05, + "grad_norm": 2.446376085281372, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7319756746292114, + "num_tokens": 153833445.0, + "step": 5940 + }, + { + "epoch": 0.6524269712277619, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.209960460662842, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6910538673400879, + "num_tokens": 153863494.0, + "step": 5941 + }, + { + "epoch": 0.6525367889303756, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.303802251815796, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.7002901434898376, + "num_tokens": 153889125.0, + "step": 5942 + }, + { + "epoch": 0.6526466066329892, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.6017990112304688, + 
"learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7014930844306946, + "num_tokens": 153911542.0, + "step": 5943 + }, + { + "epoch": 0.6527564243356029, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.197659492492676, + "learning_rate": 1e-06, + "loss": 1.1013, + "mean_token_accuracy": 0.6846467852592468, + "num_tokens": 153942533.0, + "step": 5944 + }, + { + "epoch": 0.6528662420382165, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.3904008865356445, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6936781406402588, + "num_tokens": 153967534.0, + "step": 5945 + }, + { + "epoch": 0.6529760597408302, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.190641164779663, + "learning_rate": 1e-06, + "loss": 1.1225, + "mean_token_accuracy": 0.6740169525146484, + "num_tokens": 153998781.0, + "step": 5946 + }, + { + "epoch": 0.6530858774434439, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.714365005493164, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6954646706581116, + "num_tokens": 154019262.0, + "step": 5947 + }, + { + "epoch": 0.6531956951460576, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.221609115600586, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6974440813064575, + "num_tokens": 154047477.0, + "step": 5948 + }, + { + "epoch": 0.6533055128486712, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4687952995300293, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7189409732818604, + "num_tokens": 154070596.0, + "step": 5949 + }, + { + "epoch": 0.6534153305512849, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 3.089662790298462, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.711719274520874, + "num_tokens": 154088080.0, + "step": 5950 + }, + { + "epoch": 0.6535251482538985, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4122798442840576, + "learning_rate": 1e-06, + "loss": 
0.9527, + "mean_token_accuracy": 0.7162063717842102, + "num_tokens": 154111183.0, + "step": 5951 + }, + { + "epoch": 0.6536349659565122, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4073731899261475, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.701717734336853, + "num_tokens": 154133988.0, + "step": 5952 + }, + { + "epoch": 0.6537447836591258, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.1298441886901855, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7057487964630127, + "num_tokens": 154162949.0, + "step": 5953 + }, + { + "epoch": 0.6538546013617396, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.3145782947540283, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7014840245246887, + "num_tokens": 154191471.0, + "step": 5954 + }, + { + "epoch": 0.6539644190643532, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.3648884296417236, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7073551416397095, + "num_tokens": 154214934.0, + "step": 5955 + }, + { + "epoch": 0.6540742367669669, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.2064552307128906, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6993860006332397, + "num_tokens": 154242775.0, + "step": 5956 + }, + { + "epoch": 0.6541840544695805, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.2201671600341797, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6823774576187134, + "num_tokens": 154271659.0, + "step": 5957 + }, + { + "epoch": 0.6542938721721941, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.6702327728271484, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7341400384902954, + "num_tokens": 154291248.0, + "step": 5958 + }, + { + "epoch": 0.6544036898748078, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.2411818504333496, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 
0.6982830166816711, + "num_tokens": 154319496.0, + "step": 5959 + }, + { + "epoch": 0.6545135075774214, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.1852571964263916, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7057415246963501, + "num_tokens": 154348439.0, + "step": 5960 + }, + { + "epoch": 0.6546233252800352, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4344260692596436, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7258166074752808, + "num_tokens": 154372375.0, + "step": 5961 + }, + { + "epoch": 0.6547331429826488, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.2986295223236084, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.715630829334259, + "num_tokens": 154397081.0, + "step": 5962 + }, + { + "epoch": 0.6548429606852625, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.3419299125671387, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6980568766593933, + "num_tokens": 154425629.0, + "step": 5963 + }, + { + "epoch": 0.6549527783878761, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.6731650829315186, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7267412543296814, + "num_tokens": 154447369.0, + "step": 5964 + }, + { + "epoch": 0.6550625960904898, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.597400426864624, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6959523558616638, + "num_tokens": 154469352.0, + "step": 5965 + }, + { + "epoch": 0.6551724137931034, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.26596999168396, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6890709400177002, + "num_tokens": 154498264.0, + "step": 5966 + }, + { + "epoch": 0.6552822314957171, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.5635316371917725, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7286352515220642, + "num_tokens": 
154518816.0, + "step": 5967 + }, + { + "epoch": 0.6553920491983307, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.065423011779785, + "learning_rate": 1e-06, + "loss": 1.0811, + "mean_token_accuracy": 0.680825412273407, + "num_tokens": 154553216.0, + "step": 5968 + }, + { + "epoch": 0.6555018669009445, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.550018072128296, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7349802255630493, + "num_tokens": 154573981.0, + "step": 5969 + }, + { + "epoch": 0.6556116846035581, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.252617359161377, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6977203488349915, + "num_tokens": 154602589.0, + "step": 5970 + }, + { + "epoch": 0.6557215023061718, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.7485971450805664, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7021094560623169, + "num_tokens": 154621052.0, + "step": 5971 + }, + { + "epoch": 0.6558313200087854, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.33914852142334, + "learning_rate": 1e-06, + "loss": 1.0801, + "mean_token_accuracy": 0.6841872930526733, + "num_tokens": 154647699.0, + "step": 5972 + }, + { + "epoch": 0.6559411377113991, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4394378662109375, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7176637053489685, + "num_tokens": 154669577.0, + "step": 5973 + }, + { + "epoch": 0.6560509554140127, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.339124917984009, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7116624712944031, + "num_tokens": 154695662.0, + "step": 5974 + }, + { + "epoch": 0.6561607731166264, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 8.561100006103516, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.690459132194519, + "num_tokens": 154723545.0, + "step": 5975 + }, + { + 
"epoch": 0.6562705908192401, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.2288825511932373, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6853632926940918, + "num_tokens": 154755534.0, + "step": 5976 + }, + { + "epoch": 0.6563804085218538, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4116837978363037, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6957710981369019, + "num_tokens": 154781348.0, + "step": 5977 + }, + { + "epoch": 0.6564902262244674, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.273949384689331, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6946009397506714, + "num_tokens": 154810297.0, + "step": 5978 + }, + { + "epoch": 0.656600043927081, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.5147945880889893, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7102633118629456, + "num_tokens": 154831098.0, + "step": 5979 + }, + { + "epoch": 0.6567098616296947, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.3600521087646484, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6802974939346313, + "num_tokens": 154857877.0, + "step": 5980 + }, + { + "epoch": 0.6568196793323083, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 3.195060968399048, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7106976509094238, + "num_tokens": 154883350.0, + "step": 5981 + }, + { + "epoch": 0.656929497034922, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.308429718017578, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6916494369506836, + "num_tokens": 154909297.0, + "step": 5982 + }, + { + "epoch": 0.6570393147375357, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.2095649242401123, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.707645058631897, + "num_tokens": 154936292.0, + "step": 5983 + }, + { + "epoch": 0.6571491324401494, + 
"ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.068131923675537, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6814886927604675, + "num_tokens": 154970211.0, + "step": 5984 + }, + { + "epoch": 0.657258950142763, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.69474458694458, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7155972719192505, + "num_tokens": 154989482.0, + "step": 5985 + }, + { + "epoch": 0.6573687678453767, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.2212696075439453, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7031036615371704, + "num_tokens": 155017921.0, + "step": 5986 + }, + { + "epoch": 0.6574785855479903, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.387122392654419, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6919289827346802, + "num_tokens": 155042883.0, + "step": 5987 + }, + { + "epoch": 0.657588403250604, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.134995937347412, + "learning_rate": 1e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.6904964447021484, + "num_tokens": 155071816.0, + "step": 5988 + }, + { + "epoch": 0.6576982209532176, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.427485227584839, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6928972005844116, + "num_tokens": 155098747.0, + "step": 5989 + }, + { + "epoch": 0.6578080386558314, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.610535144805908, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.717400074005127, + "num_tokens": 155119942.0, + "step": 5990 + }, + { + "epoch": 0.657917856358445, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.2666265964508057, + "learning_rate": 1e-06, + "loss": 1.0882, + "mean_token_accuracy": 0.6810914278030396, + "num_tokens": 155146995.0, + "step": 5991 + }, + { + "epoch": 0.6580276740610587, + "ewc_loss": 1.2874603271484375e-05, + 
"grad_norm": 2.164215326309204, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6890518665313721, + "num_tokens": 155179232.0, + "step": 5992 + }, + { + "epoch": 0.6581374917636723, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4800915718078613, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.700070321559906, + "num_tokens": 155202130.0, + "step": 5993 + }, + { + "epoch": 0.658247309466286, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4389007091522217, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6843778491020203, + "num_tokens": 155227196.0, + "step": 5994 + }, + { + "epoch": 0.6583571271688996, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.4014010429382324, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.686124861240387, + "num_tokens": 155254350.0, + "step": 5995 + }, + { + "epoch": 0.6584669448715132, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 2.625732660293579, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7058130502700806, + "num_tokens": 155274055.0, + "step": 5996 + }, + { + "epoch": 0.6585767625741269, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2877635955810547, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7025809288024902, + "num_tokens": 155299575.0, + "step": 5997 + }, + { + "epoch": 0.6586865802767407, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 2.613731622695923, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7034482955932617, + "num_tokens": 155322020.0, + "step": 5998 + }, + { + "epoch": 0.6587963979793543, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.20837140083313, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.690131664276123, + "num_tokens": 155351256.0, + "step": 5999 + }, + { + "epoch": 0.6589062156819679, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.270277500152588, + 
"learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6876546740531921, + "num_tokens": 155380192.0, + "step": 6000 + }, + { + "epoch": 0.6590160333845816, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.3864552974700928, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6950902938842773, + "num_tokens": 155404549.0, + "step": 6001 + }, + { + "epoch": 0.6591258510871952, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.268794536590576, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.699607789516449, + "num_tokens": 155432136.0, + "step": 6002 + }, + { + "epoch": 0.6592356687898089, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.248445987701416, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6979718208312988, + "num_tokens": 155462102.0, + "step": 6003 + }, + { + "epoch": 0.6593454864924225, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.39638614654541, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7027074098587036, + "num_tokens": 155486045.0, + "step": 6004 + }, + { + "epoch": 0.6594553041950363, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.4901046752929688, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6962785720825195, + "num_tokens": 155510272.0, + "step": 6005 + }, + { + "epoch": 0.6595651218976499, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.046241044998169, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7214475870132446, + "num_tokens": 155541765.0, + "step": 6006 + }, + { + "epoch": 0.6596749396002636, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.048476219177246, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7009687423706055, + "num_tokens": 155574579.0, + "step": 6007 + }, + { + "epoch": 0.6597847573028772, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.1610913276672363, + "learning_rate": 1e-06, + "loss": 
1.0412, + "mean_token_accuracy": 0.6909335851669312, + "num_tokens": 155603237.0, + "step": 6008 + }, + { + "epoch": 0.6598945750054909, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.6293323040008545, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.705742597579956, + "num_tokens": 155623349.0, + "step": 6009 + }, + { + "epoch": 0.6600043927081045, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2632834911346436, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6822889447212219, + "num_tokens": 155651660.0, + "step": 6010 + }, + { + "epoch": 0.6601142104107182, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.3038666248321533, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.699173092842102, + "num_tokens": 155676112.0, + "step": 6011 + }, + { + "epoch": 0.6602240281133319, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.071998357772827, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7194114923477173, + "num_tokens": 155706650.0, + "step": 6012 + }, + { + "epoch": 0.6603338458159456, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.0715978145599365, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7036747336387634, + "num_tokens": 155742884.0, + "step": 6013 + }, + { + "epoch": 0.6604436635185592, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2419815063476562, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7260169386863708, + "num_tokens": 155769598.0, + "step": 6014 + }, + { + "epoch": 0.6605534812211729, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.4311630725860596, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6949719190597534, + "num_tokens": 155792342.0, + "step": 6015 + }, + { + "epoch": 0.6606632989237865, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2159860134124756, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 
0.7035943865776062, + "num_tokens": 155820917.0, + "step": 6016 + }, + { + "epoch": 0.6607731166264001, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.262354850769043, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7138886451721191, + "num_tokens": 155846192.0, + "step": 6017 + }, + { + "epoch": 0.6608829343290138, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.474653482437134, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6900638937950134, + "num_tokens": 155869072.0, + "step": 6018 + }, + { + "epoch": 0.6609927520316276, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.38008975982666, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6905622482299805, + "num_tokens": 155898142.0, + "step": 6019 + }, + { + "epoch": 0.6611025697342412, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2611188888549805, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7165517210960388, + "num_tokens": 155925699.0, + "step": 6020 + }, + { + "epoch": 0.6612123874368548, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.417097330093384, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6901257634162903, + "num_tokens": 155950313.0, + "step": 6021 + }, + { + "epoch": 0.6613222051394685, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2168776988983154, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7161439657211304, + "num_tokens": 155977689.0, + "step": 6022 + }, + { + "epoch": 0.6614320228420821, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.385251522064209, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7038102149963379, + "num_tokens": 156003254.0, + "step": 6023 + }, + { + "epoch": 0.6615418405446958, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.449430465698242, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7164953947067261, + "num_tokens": 
156027004.0, + "step": 6024 + }, + { + "epoch": 0.6616516582473094, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.272526741027832, + "learning_rate": 1e-06, + "loss": 1.0774, + "mean_token_accuracy": 0.6801483631134033, + "num_tokens": 156057555.0, + "step": 6025 + }, + { + "epoch": 0.6617614759499231, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.3718619346618652, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6916377544403076, + "num_tokens": 156083938.0, + "step": 6026 + }, + { + "epoch": 0.6618712936525368, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.130718946456909, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7044729590415955, + "num_tokens": 156115291.0, + "step": 6027 + }, + { + "epoch": 0.6619811113551505, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2469189167022705, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7146327495574951, + "num_tokens": 156141710.0, + "step": 6028 + }, + { + "epoch": 0.6620909290577641, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.3539538383483887, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6966360807418823, + "num_tokens": 156167447.0, + "step": 6029 + }, + { + "epoch": 0.6622007467603778, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.19315767288208, + "learning_rate": 1e-06, + "loss": 1.1042, + "mean_token_accuracy": 0.6804431676864624, + "num_tokens": 156197415.0, + "step": 6030 + }, + { + "epoch": 0.6623105644629914, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.8862411975860596, + "learning_rate": 1e-06, + "loss": 1.1145, + "mean_token_accuracy": 0.6766330003738403, + "num_tokens": 156235692.0, + "step": 6031 + }, + { + "epoch": 0.6624203821656051, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2216172218322754, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7130552530288696, + "num_tokens": 156263420.0, + "step": 6032 + }, + 
{ + "epoch": 0.6625301998682187, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.493541955947876, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7193688154220581, + "num_tokens": 156285855.0, + "step": 6033 + }, + { + "epoch": 0.6626400175708325, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2094528675079346, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6989177465438843, + "num_tokens": 156315182.0, + "step": 6034 + }, + { + "epoch": 0.6627498352734461, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.294511079788208, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7135984897613525, + "num_tokens": 156340739.0, + "step": 6035 + }, + { + "epoch": 0.6628596529760598, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.303657054901123, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6954828500747681, + "num_tokens": 156367369.0, + "step": 6036 + }, + { + "epoch": 0.6629694706786734, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.4454634189605713, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6907445192337036, + "num_tokens": 156393412.0, + "step": 6037 + }, + { + "epoch": 0.663079288381287, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.3958194255828857, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7079607248306274, + "num_tokens": 156419102.0, + "step": 6038 + }, + { + "epoch": 0.6631891060839007, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2699615955352783, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6920968294143677, + "num_tokens": 156446862.0, + "step": 6039 + }, + { + "epoch": 0.6632989237865143, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.48496150970459, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7047039270401001, + "num_tokens": 156471500.0, + "step": 6040 + }, + { + "epoch": 0.6634087414891281, + 
"ewc_loss": 1.2934207916259766e-05, + "grad_norm": 2.2812557220458984, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7054591774940491, + "num_tokens": 156497443.0, + "step": 6041 + }, + { + "epoch": 0.6635185591917417, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 2.191479206085205, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6875910758972168, + "num_tokens": 156527999.0, + "step": 6042 + }, + { + "epoch": 0.6636283768943554, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 2.423264503479004, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6971476078033447, + "num_tokens": 156551836.0, + "step": 6043 + }, + { + "epoch": 0.663738194596969, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 2.226656436920166, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6842783689498901, + "num_tokens": 156580317.0, + "step": 6044 + }, + { + "epoch": 0.6638480122995827, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 2.4476449489593506, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7126334309577942, + "num_tokens": 156603323.0, + "step": 6045 + }, + { + "epoch": 0.6639578300021963, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 2.217794179916382, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7016900777816772, + "num_tokens": 156630293.0, + "step": 6046 + }, + { + "epoch": 0.66406764770481, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 2.3053667545318604, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.688392162322998, + "num_tokens": 156658155.0, + "step": 6047 + }, + { + "epoch": 0.6641774654074237, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2942519187927246, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6939442157745361, + "num_tokens": 156685519.0, + "step": 6048 + }, + { + "epoch": 0.6642872831100374, + "ewc_loss": 1.3053417205810547e-05, + 
"grad_norm": 2.2014899253845215, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7117195129394531, + "num_tokens": 156712257.0, + "step": 6049 + }, + { + "epoch": 0.664397100812651, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4127261638641357, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7009443640708923, + "num_tokens": 156735594.0, + "step": 6050 + }, + { + "epoch": 0.6645069185152647, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.35121488571167, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.696987509727478, + "num_tokens": 156760714.0, + "step": 6051 + }, + { + "epoch": 0.6646167362178783, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1696417331695557, + "learning_rate": 1e-06, + "loss": 1.1351, + "mean_token_accuracy": 0.6773486733436584, + "num_tokens": 156791882.0, + "step": 6052 + }, + { + "epoch": 0.664726553920492, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4381909370422363, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6951225996017456, + "num_tokens": 156816170.0, + "step": 6053 + }, + { + "epoch": 0.6648363716231056, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4329047203063965, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7329813838005066, + "num_tokens": 156838617.0, + "step": 6054 + }, + { + "epoch": 0.6649461893257194, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.176846504211426, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6850184202194214, + "num_tokens": 156870103.0, + "step": 6055 + }, + { + "epoch": 0.665056007028333, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4545276165008545, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7054659128189087, + "num_tokens": 156895180.0, + "step": 6056 + }, + { + "epoch": 0.6651658247309467, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3861048221588135, + 
"learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7031126022338867, + "num_tokens": 156919471.0, + "step": 6057 + }, + { + "epoch": 0.6652756424335603, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.329444169998169, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.7048130035400391, + "num_tokens": 156947861.0, + "step": 6058 + }, + { + "epoch": 0.665385460136174, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.682366132736206, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7189821004867554, + "num_tokens": 156966379.0, + "step": 6059 + }, + { + "epoch": 0.6654952778387876, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.448489189147949, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7353618144989014, + "num_tokens": 156988824.0, + "step": 6060 + }, + { + "epoch": 0.6656050955414012, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4307475090026855, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6918480396270752, + "num_tokens": 157012835.0, + "step": 6061 + }, + { + "epoch": 0.6657149132440149, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4141812324523926, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7115074396133423, + "num_tokens": 157041090.0, + "step": 6062 + }, + { + "epoch": 0.6658247309466286, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3043203353881836, + "learning_rate": 1e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.687113881111145, + "num_tokens": 157069696.0, + "step": 6063 + }, + { + "epoch": 0.6659345486492423, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.6358261108398438, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.735327422618866, + "num_tokens": 157088566.0, + "step": 6064 + }, + { + "epoch": 0.6660443663518559, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2776801586151123, + "learning_rate": 1e-06, + "loss": 
0.9893, + "mean_token_accuracy": 0.7052170634269714, + "num_tokens": 157114916.0, + "step": 6065 + }, + { + "epoch": 0.6661541840544696, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2172958850860596, + "learning_rate": 1e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6768520474433899, + "num_tokens": 157146212.0, + "step": 6066 + }, + { + "epoch": 0.6662640017570832, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 3.1686036586761475, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6998440027236938, + "num_tokens": 157168951.0, + "step": 6067 + }, + { + "epoch": 0.6663738194596969, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.6766412258148193, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7080004215240479, + "num_tokens": 157189670.0, + "step": 6068 + }, + { + "epoch": 0.6664836371623105, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.686462163925171, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7068322896957397, + "num_tokens": 157214174.0, + "step": 6069 + }, + { + "epoch": 0.6665934548649243, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.246685743331909, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6839552521705627, + "num_tokens": 157242797.0, + "step": 6070 + }, + { + "epoch": 0.6667032725675379, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.739949941635132, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7015822529792786, + "num_tokens": 157262190.0, + "step": 6071 + }, + { + "epoch": 0.6668130902701516, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5556328296661377, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.713708221912384, + "num_tokens": 157283822.0, + "step": 6072 + }, + { + "epoch": 0.6669229079727652, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3563485145568848, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 
0.6921728253364563, + "num_tokens": 157308337.0, + "step": 6073 + }, + { + "epoch": 0.6670327256753789, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.322444200515747, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7085080146789551, + "num_tokens": 157332784.0, + "step": 6074 + }, + { + "epoch": 0.6671425433779925, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.324359178543091, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6856147050857544, + "num_tokens": 157360868.0, + "step": 6075 + }, + { + "epoch": 0.6672523610806061, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.615814447402954, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.694786548614502, + "num_tokens": 157382569.0, + "step": 6076 + }, + { + "epoch": 0.6673621787832199, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2225899696350098, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6913084983825684, + "num_tokens": 157409460.0, + "step": 6077 + }, + { + "epoch": 0.6674719964858336, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2258148193359375, + "learning_rate": 1e-06, + "loss": 1.0787, + "mean_token_accuracy": 0.6855411529541016, + "num_tokens": 157440052.0, + "step": 6078 + }, + { + "epoch": 0.6675818141884472, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4814505577087402, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6932960748672485, + "num_tokens": 157464038.0, + "step": 6079 + }, + { + "epoch": 0.6676916318910608, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1553359031677246, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6949849128723145, + "num_tokens": 157492927.0, + "step": 6080 + }, + { + "epoch": 0.6678014495936745, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1084070205688477, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7123986482620239, + "num_tokens": 
157522518.0, + "step": 6081 + }, + { + "epoch": 0.6679112672962881, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.366506338119507, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7049008011817932, + "num_tokens": 157547170.0, + "step": 6082 + }, + { + "epoch": 0.6680210849989018, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.318998336791992, + "learning_rate": 1e-06, + "loss": 1.1712, + "mean_token_accuracy": 0.667961061000824, + "num_tokens": 157575044.0, + "step": 6083 + }, + { + "epoch": 0.6681309027015155, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 1.9459134340286255, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6846101880073547, + "num_tokens": 157614962.0, + "step": 6084 + }, + { + "epoch": 0.6682407204041292, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5134449005126953, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7013227939605713, + "num_tokens": 157638797.0, + "step": 6085 + }, + { + "epoch": 0.6683505381067428, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3277602195739746, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7015376687049866, + "num_tokens": 157663629.0, + "step": 6086 + }, + { + "epoch": 0.6684603558093565, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.44887638092041, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7186314463615417, + "num_tokens": 157686851.0, + "step": 6087 + }, + { + "epoch": 0.6685701735119701, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.624978542327881, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.720880925655365, + "num_tokens": 157706324.0, + "step": 6088 + }, + { + "epoch": 0.6686799912145838, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5806527137756348, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6914823651313782, + "num_tokens": 157730442.0, + "step": 6089 + }, + { + 
"epoch": 0.6687898089171974, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.624256134033203, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7185589075088501, + "num_tokens": 157753769.0, + "step": 6090 + }, + { + "epoch": 0.6688996266198111, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2480878829956055, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6934686899185181, + "num_tokens": 157783437.0, + "step": 6091 + }, + { + "epoch": 0.6690094443224248, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.24712872505188, + "learning_rate": 1e-06, + "loss": 1.1107, + "mean_token_accuracy": 0.6705291867256165, + "num_tokens": 157813602.0, + "step": 6092 + }, + { + "epoch": 0.6691192620250385, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1796677112579346, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7100933790206909, + "num_tokens": 157841665.0, + "step": 6093 + }, + { + "epoch": 0.6692290797276521, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5053751468658447, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7018163204193115, + "num_tokens": 157864011.0, + "step": 6094 + }, + { + "epoch": 0.6693388974302658, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1549510955810547, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7063397169113159, + "num_tokens": 157893302.0, + "step": 6095 + }, + { + "epoch": 0.6694487151328794, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4174208641052246, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7199734449386597, + "num_tokens": 157917252.0, + "step": 6096 + }, + { + "epoch": 0.669558532835493, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4075124263763428, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7037627696990967, + "num_tokens": 157946229.0, + "step": 6097 + }, + { + "epoch": 0.6696683505381067, + 
"ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.394728660583496, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6868987679481506, + "num_tokens": 157971749.0, + "step": 6098 + }, + { + "epoch": 0.6697781682407205, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1183676719665527, + "learning_rate": 1e-06, + "loss": 1.09, + "mean_token_accuracy": 0.6809908747673035, + "num_tokens": 158004797.0, + "step": 6099 + }, + { + "epoch": 0.6698879859433341, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2578907012939453, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.7002416849136353, + "num_tokens": 158032561.0, + "step": 6100 + }, + { + "epoch": 0.6699978036459477, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.520803689956665, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7257580757141113, + "num_tokens": 158054892.0, + "step": 6101 + }, + { + "epoch": 0.6701076213485614, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3443408012390137, + "learning_rate": 1e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6875635385513306, + "num_tokens": 158083531.0, + "step": 6102 + }, + { + "epoch": 0.670217439051175, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.622495174407959, + "learning_rate": 1e-06, + "loss": 1.061, + "mean_token_accuracy": 0.6907867193222046, + "num_tokens": 158105142.0, + "step": 6103 + }, + { + "epoch": 0.6703272567537887, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.7611804008483887, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.702831506729126, + "num_tokens": 158124184.0, + "step": 6104 + }, + { + "epoch": 0.6704370744564023, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.576197862625122, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7163255214691162, + "num_tokens": 158144656.0, + "step": 6105 + }, + { + "epoch": 0.6705468921590161, + "ewc_loss": 1.3053417205810547e-05, + 
"grad_norm": 2.5259945392608643, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7040879130363464, + "num_tokens": 158168309.0, + "step": 6106 + }, + { + "epoch": 0.6706567098616297, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.6172633171081543, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7171736359596252, + "num_tokens": 158189567.0, + "step": 6107 + }, + { + "epoch": 0.6707665275642434, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5188229084014893, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7075262069702148, + "num_tokens": 158210476.0, + "step": 6108 + }, + { + "epoch": 0.670876345266857, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.295682668685913, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7183870077133179, + "num_tokens": 158237155.0, + "step": 6109 + }, + { + "epoch": 0.6709861629694707, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5567798614501953, + "learning_rate": 1e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.6871693134307861, + "num_tokens": 158260792.0, + "step": 6110 + }, + { + "epoch": 0.6710959806720843, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5723133087158203, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7202185392379761, + "num_tokens": 158280874.0, + "step": 6111 + }, + { + "epoch": 0.671205798374698, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3564090728759766, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7178206443786621, + "num_tokens": 158304659.0, + "step": 6112 + }, + { + "epoch": 0.6713156160773117, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1743886470794678, + "learning_rate": 1e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.6833834648132324, + "num_tokens": 158334472.0, + "step": 6113 + }, + { + "epoch": 0.6714254337799254, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.127769947052002, + 
"learning_rate": 1e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.6855950355529785, + "num_tokens": 158367217.0, + "step": 6114 + }, + { + "epoch": 0.671535251482539, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1803762912750244, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6949470043182373, + "num_tokens": 158396304.0, + "step": 6115 + }, + { + "epoch": 0.6716450691851527, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2878427505493164, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6910341382026672, + "num_tokens": 158422496.0, + "step": 6116 + }, + { + "epoch": 0.6717548868877663, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.424755096435547, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7016680240631104, + "num_tokens": 158445506.0, + "step": 6117 + }, + { + "epoch": 0.67186470459038, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1371967792510986, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7104906439781189, + "num_tokens": 158473721.0, + "step": 6118 + }, + { + "epoch": 0.6719745222929936, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4402990341186523, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7119129300117493, + "num_tokens": 158496941.0, + "step": 6119 + }, + { + "epoch": 0.6720843399956072, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.145845413208008, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7189560532569885, + "num_tokens": 158525505.0, + "step": 6120 + }, + { + "epoch": 0.672194157698221, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5781030654907227, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7023247480392456, + "num_tokens": 158548342.0, + "step": 6121 + }, + { + "epoch": 0.6723039754008346, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2309117317199707, + "learning_rate": 1e-06, + "loss": 
1.0407, + "mean_token_accuracy": 0.6914514303207397, + "num_tokens": 158577693.0, + "step": 6122 + }, + { + "epoch": 0.6724137931034483, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3564670085906982, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7154932022094727, + "num_tokens": 158602480.0, + "step": 6123 + }, + { + "epoch": 0.6725236108060619, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.217095136642456, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7018316984176636, + "num_tokens": 158631339.0, + "step": 6124 + }, + { + "epoch": 0.6726334285086756, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.7165699005126953, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7065882682800293, + "num_tokens": 158651126.0, + "step": 6125 + }, + { + "epoch": 0.6727432462112892, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4998650550842285, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7171446084976196, + "num_tokens": 158674262.0, + "step": 6126 + }, + { + "epoch": 0.6728530639139029, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.6328954696655273, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6939812302589417, + "num_tokens": 158695067.0, + "step": 6127 + }, + { + "epoch": 0.6729628816165166, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.556959390640259, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7273524403572083, + "num_tokens": 158716763.0, + "step": 6128 + }, + { + "epoch": 0.6730726993191303, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.260751247406006, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7020151615142822, + "num_tokens": 158743441.0, + "step": 6129 + }, + { + "epoch": 0.6731825170217439, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.394984722137451, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 
0.7107075452804565, + "num_tokens": 158768551.0, + "step": 6130 + }, + { + "epoch": 0.6732923347243576, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4602246284484863, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6989655494689941, + "num_tokens": 158793558.0, + "step": 6131 + }, + { + "epoch": 0.6734021524269712, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3021316528320312, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7276023626327515, + "num_tokens": 158818430.0, + "step": 6132 + }, + { + "epoch": 0.6735119701295849, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5046255588531494, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6911609172821045, + "num_tokens": 158843138.0, + "step": 6133 + }, + { + "epoch": 0.6736217878321985, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1588358879089355, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.7036612033843994, + "num_tokens": 158874477.0, + "step": 6134 + }, + { + "epoch": 0.6737316055348123, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4152286052703857, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7071511745452881, + "num_tokens": 158899004.0, + "step": 6135 + }, + { + "epoch": 0.6738414232374259, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3590338230133057, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.702223539352417, + "num_tokens": 158924939.0, + "step": 6136 + }, + { + "epoch": 0.6739512409400396, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.296395778656006, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.715262770652771, + "num_tokens": 158950178.0, + "step": 6137 + }, + { + "epoch": 0.6740610586426532, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.163665294647217, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6985684633255005, + "num_tokens": 
158979755.0, + "step": 6138 + }, + { + "epoch": 0.6741708763452668, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3042449951171875, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7037129402160645, + "num_tokens": 159007944.0, + "step": 6139 + }, + { + "epoch": 0.6742806940478805, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.070392370223999, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7167022228240967, + "num_tokens": 159037619.0, + "step": 6140 + }, + { + "epoch": 0.6743905117504941, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2177183628082275, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7122907042503357, + "num_tokens": 159065207.0, + "step": 6141 + }, + { + "epoch": 0.6745003294531079, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.288977861404419, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.696431040763855, + "num_tokens": 159091637.0, + "step": 6142 + }, + { + "epoch": 0.6746101471557215, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5391037464141846, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.707785964012146, + "num_tokens": 159114372.0, + "step": 6143 + }, + { + "epoch": 0.6747199648583352, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.425826072692871, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6922541856765747, + "num_tokens": 159141680.0, + "step": 6144 + }, + { + "epoch": 0.6748297825609488, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.642636775970459, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.712907075881958, + "num_tokens": 159162816.0, + "step": 6145 + }, + { + "epoch": 0.6749396002635625, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3319334983825684, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6965380907058716, + "num_tokens": 159188403.0, + "step": 6146 + }, + { + 
"epoch": 0.6750494179661761, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2153539657592773, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6958321332931519, + "num_tokens": 159218711.0, + "step": 6147 + }, + { + "epoch": 0.6751592356687898, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2675118446350098, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.686164379119873, + "num_tokens": 159246877.0, + "step": 6148 + }, + { + "epoch": 0.6752690533714034, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1518876552581787, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7074851393699646, + "num_tokens": 159275042.0, + "step": 6149 + }, + { + "epoch": 0.6753788710740172, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 1.9526286125183105, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7123849987983704, + "num_tokens": 159307958.0, + "step": 6150 + }, + { + "epoch": 0.6754886887766308, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5233359336853027, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7025380730628967, + "num_tokens": 159330280.0, + "step": 6151 + }, + { + "epoch": 0.6755985064792445, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.225471258163452, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6996191143989563, + "num_tokens": 159358692.0, + "step": 6152 + }, + { + "epoch": 0.6757083241818581, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.099252223968506, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7059266567230225, + "num_tokens": 159387486.0, + "step": 6153 + }, + { + "epoch": 0.6758181418844718, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4181203842163086, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7074529528617859, + "num_tokens": 159411417.0, + "step": 6154 + }, + { + "epoch": 0.6759279595870854, + 
"ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.125642776489258, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.708191454410553, + "num_tokens": 159442444.0, + "step": 6155 + }, + { + "epoch": 0.676037777289699, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.391458511352539, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6954641938209534, + "num_tokens": 159466545.0, + "step": 6156 + }, + { + "epoch": 0.6761475949923128, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.411206007003784, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6957739591598511, + "num_tokens": 159491951.0, + "step": 6157 + }, + { + "epoch": 0.6762574126949265, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.117781639099121, + "learning_rate": 1e-06, + "loss": 1.1455, + "mean_token_accuracy": 0.6713597774505615, + "num_tokens": 159524107.0, + "step": 6158 + }, + { + "epoch": 0.6763672303975401, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.004818916320801, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.710699737071991, + "num_tokens": 159560268.0, + "step": 6159 + }, + { + "epoch": 0.6764770481001537, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4316563606262207, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7375395894050598, + "num_tokens": 159582247.0, + "step": 6160 + }, + { + "epoch": 0.6765868658027674, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3790900707244873, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7096679210662842, + "num_tokens": 159608502.0, + "step": 6161 + }, + { + "epoch": 0.676696683505381, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.574950695037842, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7191166281700134, + "num_tokens": 159629373.0, + "step": 6162 + }, + { + "epoch": 0.6768065012079947, + "ewc_loss": 1.3053417205810547e-05, + 
"grad_norm": 2.412989854812622, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6902379989624023, + "num_tokens": 159654888.0, + "step": 6163 + }, + { + "epoch": 0.6769163189106084, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.379944324493408, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7120332717895508, + "num_tokens": 159678909.0, + "step": 6164 + }, + { + "epoch": 0.6770261366132221, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.711968183517456, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7133170366287231, + "num_tokens": 159699296.0, + "step": 6165 + }, + { + "epoch": 0.6771359543158357, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2237589359283447, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6912346482276917, + "num_tokens": 159728051.0, + "step": 6166 + }, + { + "epoch": 0.6772457720184494, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3695807456970215, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7168680429458618, + "num_tokens": 159752609.0, + "step": 6167 + }, + { + "epoch": 0.677355589721063, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.583012580871582, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6962926387786865, + "num_tokens": 159775222.0, + "step": 6168 + }, + { + "epoch": 0.6774654074236767, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.589137315750122, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6941395998001099, + "num_tokens": 159795594.0, + "step": 6169 + }, + { + "epoch": 0.6775752251262903, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3484723567962646, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6981198787689209, + "num_tokens": 159821063.0, + "step": 6170 + }, + { + "epoch": 0.6776850428289041, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1934871673583984, + 
"learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.709602415561676, + "num_tokens": 159850148.0, + "step": 6171 + }, + { + "epoch": 0.6777948605315177, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.407435655593872, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7130734920501709, + "num_tokens": 159875384.0, + "step": 6172 + }, + { + "epoch": 0.6779046782341314, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.408540964126587, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.705965518951416, + "num_tokens": 159898946.0, + "step": 6173 + }, + { + "epoch": 0.678014495936745, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4036033153533936, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6805851459503174, + "num_tokens": 159925065.0, + "step": 6174 + }, + { + "epoch": 0.6781243136393587, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.376211166381836, + "learning_rate": 1e-06, + "loss": 1.0774, + "mean_token_accuracy": 0.6811869144439697, + "num_tokens": 159950968.0, + "step": 6175 + }, + { + "epoch": 0.6782341313419723, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5005877017974854, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6937528252601624, + "num_tokens": 159972968.0, + "step": 6176 + }, + { + "epoch": 0.678343949044586, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.157046318054199, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.739200234413147, + "num_tokens": 160000824.0, + "step": 6177 + }, + { + "epoch": 0.6784537667471996, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.396350145339966, + "learning_rate": 1e-06, + "loss": 1.0807, + "mean_token_accuracy": 0.6822456121444702, + "num_tokens": 160026215.0, + "step": 6178 + }, + { + "epoch": 0.6785635844498134, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4265642166137695, + "learning_rate": 1e-06, + "loss": 0.9929, 
+ "mean_token_accuracy": 0.7013574242591858, + "num_tokens": 160048907.0, + "step": 6179 + }, + { + "epoch": 0.678673402152427, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.750237226486206, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6913881301879883, + "num_tokens": 160078943.0, + "step": 6180 + }, + { + "epoch": 0.6787832198550406, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2951338291168213, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7074392437934875, + "num_tokens": 160105582.0, + "step": 6181 + }, + { + "epoch": 0.6788930375576543, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3299596309661865, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7035460472106934, + "num_tokens": 160128859.0, + "step": 6182 + }, + { + "epoch": 0.6790028552602679, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1386513710021973, + "learning_rate": 1e-06, + "loss": 1.1417, + "mean_token_accuracy": 0.6680209636688232, + "num_tokens": 160161771.0, + "step": 6183 + }, + { + "epoch": 0.6791126729628816, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4666287899017334, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7117679119110107, + "num_tokens": 160185315.0, + "step": 6184 + }, + { + "epoch": 0.6792224906654952, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3101015090942383, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7310008406639099, + "num_tokens": 160210093.0, + "step": 6185 + }, + { + "epoch": 0.679332308368109, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.498279094696045, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.696735680103302, + "num_tokens": 160235947.0, + "step": 6186 + }, + { + "epoch": 0.6794421260707226, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.278054714202881, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 
0.7113999128341675, + "num_tokens": 160262839.0, + "step": 6187 + }, + { + "epoch": 0.6795519437733363, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4042460918426514, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.70069819688797, + "num_tokens": 160291066.0, + "step": 6188 + }, + { + "epoch": 0.6796617614759499, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.371950149536133, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6946078538894653, + "num_tokens": 160316101.0, + "step": 6189 + }, + { + "epoch": 0.6797715791785636, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1817798614501953, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7198305130004883, + "num_tokens": 160342531.0, + "step": 6190 + }, + { + "epoch": 0.6798813968811772, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.71573805809021, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.7052145004272461, + "num_tokens": 160361907.0, + "step": 6191 + }, + { + "epoch": 0.6799912145837909, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.410874843597412, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7252298593521118, + "num_tokens": 160385012.0, + "step": 6192 + }, + { + "epoch": 0.6801010322864046, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.0611746311187744, + "learning_rate": 1e-06, + "loss": 1.1149, + "mean_token_accuracy": 0.6756383180618286, + "num_tokens": 160418417.0, + "step": 6193 + }, + { + "epoch": 0.6802108499890183, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.7465527057647705, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7178552150726318, + "num_tokens": 160436585.0, + "step": 6194 + }, + { + "epoch": 0.6803206676916319, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.199281930923462, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6938612461090088, + "num_tokens": 
160465543.0, + "step": 6195 + }, + { + "epoch": 0.6804304853942456, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.530308485031128, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6886497735977173, + "num_tokens": 160494434.0, + "step": 6196 + }, + { + "epoch": 0.6805403030968592, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1019279956817627, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6963501572608948, + "num_tokens": 160522554.0, + "step": 6197 + }, + { + "epoch": 0.6806501207994728, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.549523115158081, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7011808156967163, + "num_tokens": 160545157.0, + "step": 6198 + }, + { + "epoch": 0.6807599385020865, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1499507427215576, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6916075944900513, + "num_tokens": 160573827.0, + "step": 6199 + }, + { + "epoch": 0.6808697562047002, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.658738613128662, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7195999026298523, + "num_tokens": 160592697.0, + "step": 6200 + }, + { + "epoch": 0.6809795739073139, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2136964797973633, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7048541307449341, + "num_tokens": 160621152.0, + "step": 6201 + }, + { + "epoch": 0.6810893916099275, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.2181589603424072, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7044081687927246, + "num_tokens": 160649670.0, + "step": 6202 + }, + { + "epoch": 0.6811992093125412, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5857717990875244, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7017459273338318, + "num_tokens": 160672004.0, + "step": 6203 + }, + 
{ + "epoch": 0.6813090270151548, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.19569730758667, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6897045373916626, + "num_tokens": 160701671.0, + "step": 6204 + }, + { + "epoch": 0.6814188447177685, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.602717161178589, + "learning_rate": 1e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.6900674700737, + "num_tokens": 160724240.0, + "step": 6205 + }, + { + "epoch": 0.6815286624203821, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.3169827461242676, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6927723288536072, + "num_tokens": 160750043.0, + "step": 6206 + }, + { + "epoch": 0.6816384801229959, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.474396228790283, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6953732371330261, + "num_tokens": 160775094.0, + "step": 6207 + }, + { + "epoch": 0.6817482978256095, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1617279052734375, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7304123640060425, + "num_tokens": 160804642.0, + "step": 6208 + }, + { + "epoch": 0.6818581155282232, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.369274616241455, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.71183180809021, + "num_tokens": 160828357.0, + "step": 6209 + }, + { + "epoch": 0.6819679332308368, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.451687812805176, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7250443696975708, + "num_tokens": 160852946.0, + "step": 6210 + }, + { + "epoch": 0.6820777509334505, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3599350452423096, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6852738857269287, + "num_tokens": 160880009.0, + "step": 6211 + }, + { + "epoch": 0.6821875686360641, + 
"ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1650989055633545, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7059024572372437, + "num_tokens": 160910735.0, + "step": 6212 + }, + { + "epoch": 0.6822973863386778, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.5139455795288086, + "learning_rate": 1e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.6771224141120911, + "num_tokens": 160940187.0, + "step": 6213 + }, + { + "epoch": 0.6824072040412914, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.1883206367492676, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7129141092300415, + "num_tokens": 160967733.0, + "step": 6214 + }, + { + "epoch": 0.6825170217439052, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.4616026878356934, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.711683988571167, + "num_tokens": 160991485.0, + "step": 6215 + }, + { + "epoch": 0.6826268394465188, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.343296766281128, + "learning_rate": 1e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.6933646202087402, + "num_tokens": 161022484.0, + "step": 6216 + }, + { + "epoch": 0.6827366571491325, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.7828943729400635, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7062987089157104, + "num_tokens": 161041677.0, + "step": 6217 + }, + { + "epoch": 0.6828464748517461, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 2.105210065841675, + "learning_rate": 1e-06, + "loss": 1.1331, + "mean_token_accuracy": 0.6794458031654358, + "num_tokens": 161078083.0, + "step": 6218 + }, + { + "epoch": 0.6829562925543597, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2214303016662598, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7065092325210571, + "num_tokens": 161105164.0, + "step": 6219 + }, + { + "epoch": 0.6830661102569734, + "ewc_loss": 1.3113021850585938e-05, 
+ "grad_norm": 2.2843143939971924, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6973174214363098, + "num_tokens": 161134797.0, + "step": 6220 + }, + { + "epoch": 0.683175927959587, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1592800617218018, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.709213137626648, + "num_tokens": 161163605.0, + "step": 6221 + }, + { + "epoch": 0.6832857456622008, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.314678192138672, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7050970196723938, + "num_tokens": 161189308.0, + "step": 6222 + }, + { + "epoch": 0.6833955633648144, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4260799884796143, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7044310569763184, + "num_tokens": 161212337.0, + "step": 6223 + }, + { + "epoch": 0.6835053810674281, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.464160442352295, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7060234546661377, + "num_tokens": 161234858.0, + "step": 6224 + }, + { + "epoch": 0.6836151987700417, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2365357875823975, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7083151340484619, + "num_tokens": 161261948.0, + "step": 6225 + }, + { + "epoch": 0.6837250164726554, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4078800678253174, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7063108682632446, + "num_tokens": 161285655.0, + "step": 6226 + }, + { + "epoch": 0.683834834175269, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3080544471740723, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7054432034492493, + "num_tokens": 161313042.0, + "step": 6227 + }, + { + "epoch": 0.6839446518778827, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4579098224639893, + 
"learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6964450478553772, + "num_tokens": 161339421.0, + "step": 6228 + }, + { + "epoch": 0.6840544695804964, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.0386738777160645, + "learning_rate": 1e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.684217631816864, + "num_tokens": 161374361.0, + "step": 6229 + }, + { + "epoch": 0.6841642872831101, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3105218410491943, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7039801478385925, + "num_tokens": 161400878.0, + "step": 6230 + }, + { + "epoch": 0.6842741049857237, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2753381729125977, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.7045875787734985, + "num_tokens": 161428464.0, + "step": 6231 + }, + { + "epoch": 0.6843839226883374, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4692742824554443, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7282046675682068, + "num_tokens": 161450688.0, + "step": 6232 + }, + { + "epoch": 0.684493740390951, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.457585573196411, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7029615640640259, + "num_tokens": 161472646.0, + "step": 6233 + }, + { + "epoch": 0.6846035580935647, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.505955934524536, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6952494978904724, + "num_tokens": 161495999.0, + "step": 6234 + }, + { + "epoch": 0.6847133757961783, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.37870192527771, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7174934148788452, + "num_tokens": 161518854.0, + "step": 6235 + }, + { + "epoch": 0.6848231934987921, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.5415825843811035, + "learning_rate": 1e-06, + "loss": 
1.0174, + "mean_token_accuracy": 0.6968388557434082, + "num_tokens": 161541927.0, + "step": 6236 + }, + { + "epoch": 0.6849330112014057, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.365325689315796, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7046916484832764, + "num_tokens": 161568560.0, + "step": 6237 + }, + { + "epoch": 0.6850428289040194, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.5702872276306152, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7039211988449097, + "num_tokens": 161592384.0, + "step": 6238 + }, + { + "epoch": 0.685152646606633, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.351468086242676, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6875877380371094, + "num_tokens": 161620897.0, + "step": 6239 + }, + { + "epoch": 0.6852624643092466, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.541977882385254, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7149120569229126, + "num_tokens": 161644189.0, + "step": 6240 + }, + { + "epoch": 0.6853722820118603, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4105329513549805, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.6902251839637756, + "num_tokens": 161668350.0, + "step": 6241 + }, + { + "epoch": 0.6854820997144739, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.539295196533203, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7187029123306274, + "num_tokens": 161691163.0, + "step": 6242 + }, + { + "epoch": 0.6855919174170876, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.377859115600586, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7184785008430481, + "num_tokens": 161715801.0, + "step": 6243 + }, + { + "epoch": 0.6857017351197013, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4131383895874023, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 
0.6901113390922546, + "num_tokens": 161743236.0, + "step": 6244 + }, + { + "epoch": 0.685811552822315, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.172957420349121, + "learning_rate": 1e-06, + "loss": 1.1283, + "mean_token_accuracy": 0.6742488741874695, + "num_tokens": 161772576.0, + "step": 6245 + }, + { + "epoch": 0.6859213705249286, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.306394577026367, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7059564590454102, + "num_tokens": 161798925.0, + "step": 6246 + }, + { + "epoch": 0.6860311882275423, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.705918788909912, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7094494104385376, + "num_tokens": 161820941.0, + "step": 6247 + }, + { + "epoch": 0.6861410059301559, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.420286178588867, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6936911344528198, + "num_tokens": 161846081.0, + "step": 6248 + }, + { + "epoch": 0.6862508236327696, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.242727279663086, + "learning_rate": 1e-06, + "loss": 1.1414, + "mean_token_accuracy": 0.6685081124305725, + "num_tokens": 161875326.0, + "step": 6249 + }, + { + "epoch": 0.6863606413353832, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2632951736450195, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7056633830070496, + "num_tokens": 161902576.0, + "step": 6250 + }, + { + "epoch": 0.686470459037997, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.77280592918396, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7153596878051758, + "num_tokens": 161920422.0, + "step": 6251 + }, + { + "epoch": 0.6865802767406106, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.428504467010498, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.716312050819397, + "num_tokens": 
161943729.0, + "step": 6252 + }, + { + "epoch": 0.6866900944432243, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3320252895355225, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6916355490684509, + "num_tokens": 161970782.0, + "step": 6253 + }, + { + "epoch": 0.6867999121458379, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.0421180725097656, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6881122589111328, + "num_tokens": 162007033.0, + "step": 6254 + }, + { + "epoch": 0.6869097298484516, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.6051650047302246, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7097127437591553, + "num_tokens": 162027881.0, + "step": 6255 + }, + { + "epoch": 0.6870195475510652, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2097744941711426, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6988065838813782, + "num_tokens": 162058289.0, + "step": 6256 + }, + { + "epoch": 0.6871293652536788, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.7744579315185547, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7235305905342102, + "num_tokens": 162079047.0, + "step": 6257 + }, + { + "epoch": 0.6872391829562926, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.53937029838562, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.7014380097389221, + "num_tokens": 162100921.0, + "step": 6258 + }, + { + "epoch": 0.6873490006589063, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4413251876831055, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6961045265197754, + "num_tokens": 162125491.0, + "step": 6259 + }, + { + "epoch": 0.6874588183615199, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4015233516693115, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6938287019729614, + "num_tokens": 162150785.0, + "step": 6260 + }, 
+ { + "epoch": 0.6875686360641335, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.509260416030884, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6927273273468018, + "num_tokens": 162173188.0, + "step": 6261 + }, + { + "epoch": 0.6876784537667472, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1303133964538574, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7210367918014526, + "num_tokens": 162203116.0, + "step": 6262 + }, + { + "epoch": 0.6877882714693608, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1852364540100098, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6961401700973511, + "num_tokens": 162230643.0, + "step": 6263 + }, + { + "epoch": 0.6878980891719745, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3314850330352783, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6913566589355469, + "num_tokens": 162259943.0, + "step": 6264 + }, + { + "epoch": 0.6880079068745882, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.367921829223633, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7050682306289673, + "num_tokens": 162284553.0, + "step": 6265 + }, + { + "epoch": 0.6881177245772019, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4625682830810547, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6928062438964844, + "num_tokens": 162309858.0, + "step": 6266 + }, + { + "epoch": 0.6882275422798155, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2781872749328613, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6992433667182922, + "num_tokens": 162337622.0, + "step": 6267 + }, + { + "epoch": 0.6883373599824292, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.226290225982666, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6940053701400757, + "num_tokens": 162366547.0, + "step": 6268 + }, + { + "epoch": 0.6884471776850428, + 
"ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1935553550720215, + "learning_rate": 1e-06, + "loss": 1.0871, + "mean_token_accuracy": 0.6785640716552734, + "num_tokens": 162396141.0, + "step": 6269 + }, + { + "epoch": 0.6885569953876565, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.380106210708618, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7149048447608948, + "num_tokens": 162420169.0, + "step": 6270 + }, + { + "epoch": 0.6886668130902701, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.435879707336426, + "learning_rate": 1e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6913043260574341, + "num_tokens": 162445872.0, + "step": 6271 + }, + { + "epoch": 0.6887766307928838, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2676289081573486, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7019847631454468, + "num_tokens": 162473104.0, + "step": 6272 + }, + { + "epoch": 0.6888864484954975, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3596866130828857, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7187548875808716, + "num_tokens": 162498431.0, + "step": 6273 + }, + { + "epoch": 0.6889962661981112, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1531124114990234, + "learning_rate": 1e-06, + "loss": 1.106, + "mean_token_accuracy": 0.6770689487457275, + "num_tokens": 162528782.0, + "step": 6274 + }, + { + "epoch": 0.6891060839007248, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3132729530334473, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6997382640838623, + "num_tokens": 162553110.0, + "step": 6275 + }, + { + "epoch": 0.6892159016033385, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2431719303131104, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6870549321174622, + "num_tokens": 162581838.0, + "step": 6276 + }, + { + "epoch": 0.6893257193059521, + "ewc_loss": 1.3113021850585938e-05, 
+ "grad_norm": 2.564645528793335, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.6864314675331116, + "num_tokens": 162604965.0, + "step": 6277 + }, + { + "epoch": 0.6894355370085657, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2073097229003906, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7081140279769897, + "num_tokens": 162634386.0, + "step": 6278 + }, + { + "epoch": 0.6895453547111794, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4629979133605957, + "learning_rate": 1e-06, + "loss": 1.1095, + "mean_token_accuracy": 0.6818066835403442, + "num_tokens": 162660520.0, + "step": 6279 + }, + { + "epoch": 0.6896551724137931, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.5934994220733643, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7172855138778687, + "num_tokens": 162683482.0, + "step": 6280 + }, + { + "epoch": 0.6897649901164068, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2617061138153076, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7051115036010742, + "num_tokens": 162712341.0, + "step": 6281 + }, + { + "epoch": 0.6898748078190204, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.376568078994751, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7170438766479492, + "num_tokens": 162735854.0, + "step": 6282 + }, + { + "epoch": 0.6899846255216341, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1432607173919678, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.700239896774292, + "num_tokens": 162767452.0, + "step": 6283 + }, + { + "epoch": 0.6900944432242477, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.6241238117218018, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7198553085327148, + "num_tokens": 162788127.0, + "step": 6284 + }, + { + "epoch": 0.6902042609268614, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.210498809814453, + 
"learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7044198513031006, + "num_tokens": 162817399.0, + "step": 6285 + }, + { + "epoch": 0.690314078629475, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4296443462371826, + "learning_rate": 1e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.6822530031204224, + "num_tokens": 162842073.0, + "step": 6286 + }, + { + "epoch": 0.6904238963320888, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.344449043273926, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7302098274230957, + "num_tokens": 162865066.0, + "step": 6287 + }, + { + "epoch": 0.6905337140347024, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.43761944770813, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7091209888458252, + "num_tokens": 162889656.0, + "step": 6288 + }, + { + "epoch": 0.6906435317373161, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.385164976119995, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7097264528274536, + "num_tokens": 162913759.0, + "step": 6289 + }, + { + "epoch": 0.6907533494399297, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.308748483657837, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7094843983650208, + "num_tokens": 162939871.0, + "step": 6290 + }, + { + "epoch": 0.6908631671425434, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2352755069732666, + "learning_rate": 1e-06, + "loss": 1.073, + "mean_token_accuracy": 0.7026780843734741, + "num_tokens": 162968388.0, + "step": 6291 + }, + { + "epoch": 0.690972984845157, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.52531361579895, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7228058576583862, + "num_tokens": 162991016.0, + "step": 6292 + }, + { + "epoch": 0.6910828025477707, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3781535625457764, + "learning_rate": 1e-06, + "loss": 0.8822, 
+ "mean_token_accuracy": 0.7336392402648926, + "num_tokens": 163014896.0, + "step": 6293 + }, + { + "epoch": 0.6911926202503844, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.717231512069702, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7178426384925842, + "num_tokens": 163035705.0, + "step": 6294 + }, + { + "epoch": 0.6913024379529981, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.221367597579956, + "learning_rate": 1e-06, + "loss": 1.0874, + "mean_token_accuracy": 0.6796618700027466, + "num_tokens": 163064708.0, + "step": 6295 + }, + { + "epoch": 0.6914122556556117, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.310955047607422, + "learning_rate": 1e-06, + "loss": 1.0907, + "mean_token_accuracy": 0.6755224466323853, + "num_tokens": 163092567.0, + "step": 6296 + }, + { + "epoch": 0.6915220733582254, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3696072101593018, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7132362127304077, + "num_tokens": 163117372.0, + "step": 6297 + }, + { + "epoch": 0.691631891060839, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3758485317230225, + "learning_rate": 1e-06, + "loss": 1.1176, + "mean_token_accuracy": 0.6738479733467102, + "num_tokens": 163142418.0, + "step": 6298 + }, + { + "epoch": 0.6917417087634526, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.212237596511841, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6908270120620728, + "num_tokens": 163172080.0, + "step": 6299 + }, + { + "epoch": 0.6918515264660663, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.324648380279541, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6968227624893188, + "num_tokens": 163197984.0, + "step": 6300 + }, + { + "epoch": 0.6919613441686799, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 7.0424909591674805, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 
0.6913765668869019, + "num_tokens": 163226464.0, + "step": 6301 + }, + { + "epoch": 0.6920711618712937, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3067543506622314, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7087074518203735, + "num_tokens": 163256158.0, + "step": 6302 + }, + { + "epoch": 0.6921809795739073, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.546973943710327, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6955187320709229, + "num_tokens": 163277834.0, + "step": 6303 + }, + { + "epoch": 0.692290797276521, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.5522098541259766, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7105070948600769, + "num_tokens": 163299028.0, + "step": 6304 + }, + { + "epoch": 0.6924006149791346, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1212611198425293, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6855089664459229, + "num_tokens": 163329095.0, + "step": 6305 + }, + { + "epoch": 0.6925104326817483, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.300520420074463, + "learning_rate": 1e-06, + "loss": 1.1004, + "mean_token_accuracy": 0.6786592602729797, + "num_tokens": 163357278.0, + "step": 6306 + }, + { + "epoch": 0.6926202503843619, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3533921241760254, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7077571153640747, + "num_tokens": 163381779.0, + "step": 6307 + }, + { + "epoch": 0.6927300680869756, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.484283924102783, + "learning_rate": 1e-06, + "loss": 1.127, + "mean_token_accuracy": 0.6733188629150391, + "num_tokens": 163415146.0, + "step": 6308 + }, + { + "epoch": 0.6928398857895893, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3864095211029053, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6911590099334717, + "num_tokens": 
163440947.0, + "step": 6309 + }, + { + "epoch": 0.692949703492203, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.5955309867858887, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7152054309844971, + "num_tokens": 163460802.0, + "step": 6310 + }, + { + "epoch": 0.6930595211948166, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.991796851158142, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6996256113052368, + "num_tokens": 163496352.0, + "step": 6311 + }, + { + "epoch": 0.6931693388974303, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1287379264831543, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.7058132886886597, + "num_tokens": 163526044.0, + "step": 6312 + }, + { + "epoch": 0.6932791566000439, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.582374334335327, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6881004571914673, + "num_tokens": 163550244.0, + "step": 6313 + }, + { + "epoch": 0.6933889743026576, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3702569007873535, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6882553696632385, + "num_tokens": 163576341.0, + "step": 6314 + }, + { + "epoch": 0.6934987920052712, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1894655227661133, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.7002724409103394, + "num_tokens": 163604671.0, + "step": 6315 + }, + { + "epoch": 0.693608609707885, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.448183298110962, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6953389644622803, + "num_tokens": 163629666.0, + "step": 6316 + }, + { + "epoch": 0.6937184274104986, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1418161392211914, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7086501121520996, + "num_tokens": 163658034.0, + "step": 6317 + }, + { 
+ "epoch": 0.6938282451131123, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3633182048797607, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7101010680198669, + "num_tokens": 163683740.0, + "step": 6318 + }, + { + "epoch": 0.6939380628157259, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3682656288146973, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7243748307228088, + "num_tokens": 163706240.0, + "step": 6319 + }, + { + "epoch": 0.6940478805183395, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.7009055614471436, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.717979907989502, + "num_tokens": 163726391.0, + "step": 6320 + }, + { + "epoch": 0.6941576982209532, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3272809982299805, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7125566005706787, + "num_tokens": 163754046.0, + "step": 6321 + }, + { + "epoch": 0.6942675159235668, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3729214668273926, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7023782134056091, + "num_tokens": 163779277.0, + "step": 6322 + }, + { + "epoch": 0.6943773336261806, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1861634254455566, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6839468479156494, + "num_tokens": 163811487.0, + "step": 6323 + }, + { + "epoch": 0.6944871513287942, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.344735860824585, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.694681704044342, + "num_tokens": 163837897.0, + "step": 6324 + }, + { + "epoch": 0.6945969690314079, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.215325355529785, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.689926028251648, + "num_tokens": 163866188.0, + "step": 6325 + }, + { + "epoch": 0.6947067867340215, + 
"ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2190825939178467, + "learning_rate": 1e-06, + "loss": 1.1308, + "mean_token_accuracy": 0.677734375, + "num_tokens": 163897427.0, + "step": 6326 + }, + { + "epoch": 0.6948166044366352, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2216670513153076, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7170163989067078, + "num_tokens": 163926436.0, + "step": 6327 + }, + { + "epoch": 0.6949264221392488, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2427215576171875, + "learning_rate": 1e-06, + "loss": 1.0672, + "mean_token_accuracy": 0.6852749586105347, + "num_tokens": 163956225.0, + "step": 6328 + }, + { + "epoch": 0.6950362398418625, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3603780269622803, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.7046072483062744, + "num_tokens": 163982151.0, + "step": 6329 + }, + { + "epoch": 0.6951460575444761, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.361588716506958, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6987349987030029, + "num_tokens": 164007634.0, + "step": 6330 + }, + { + "epoch": 0.6952558752470899, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1496691703796387, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7108081579208374, + "num_tokens": 164035131.0, + "step": 6331 + }, + { + "epoch": 0.6953656929497035, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.190124750137329, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.6977463364601135, + "num_tokens": 164063187.0, + "step": 6332 + }, + { + "epoch": 0.6954755106523172, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.591362953186035, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7183026671409607, + "num_tokens": 164085148.0, + "step": 6333 + }, + { + "epoch": 0.6955853283549308, + "ewc_loss": 1.3113021850585938e-05, + 
"grad_norm": 2.4976038932800293, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.686631441116333, + "num_tokens": 164108696.0, + "step": 6334 + }, + { + "epoch": 0.6956951460575445, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.5069048404693604, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7139259576797485, + "num_tokens": 164132496.0, + "step": 6335 + }, + { + "epoch": 0.6958049637601581, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3414018154144287, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7038089036941528, + "num_tokens": 164158020.0, + "step": 6336 + }, + { + "epoch": 0.6959147814627717, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.5450804233551025, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7152310609817505, + "num_tokens": 164179998.0, + "step": 6337 + }, + { + "epoch": 0.6960245991653855, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.8671584129333496, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7263579368591309, + "num_tokens": 164197247.0, + "step": 6338 + }, + { + "epoch": 0.6961344168679992, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3685195446014404, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7047242522239685, + "num_tokens": 164222013.0, + "step": 6339 + }, + { + "epoch": 0.6962442345706128, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.4683103561401367, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7183401584625244, + "num_tokens": 164244311.0, + "step": 6340 + }, + { + "epoch": 0.6963540522732264, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.203315496444702, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7013410329818726, + "num_tokens": 164272798.0, + "step": 6341 + }, + { + "epoch": 0.6964638699758401, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3697447776794434, 
+ "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7036699056625366, + "num_tokens": 164298367.0, + "step": 6342 + }, + { + "epoch": 0.6965736876784537, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.404742479324341, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6825121641159058, + "num_tokens": 164328980.0, + "step": 6343 + }, + { + "epoch": 0.6966835053810674, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3594887256622314, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.7005624771118164, + "num_tokens": 164354971.0, + "step": 6344 + }, + { + "epoch": 0.6967933230836811, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.135594606399536, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7071375846862793, + "num_tokens": 164382484.0, + "step": 6345 + }, + { + "epoch": 0.6969031407862948, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.574591875076294, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7117061614990234, + "num_tokens": 164406329.0, + "step": 6346 + }, + { + "epoch": 0.6970129584889084, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.1410117149353027, + "learning_rate": 1e-06, + "loss": 1.1378, + "mean_token_accuracy": 0.6745133996009827, + "num_tokens": 164438435.0, + "step": 6347 + }, + { + "epoch": 0.6971227761915221, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.600846290588379, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6995447278022766, + "num_tokens": 164458132.0, + "step": 6348 + }, + { + "epoch": 0.6972325938941357, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.3124442100524902, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7167599201202393, + "num_tokens": 164483148.0, + "step": 6349 + }, + { + "epoch": 0.6973424115967494, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.314059019088745, + "learning_rate": 1e-06, + "loss": 
0.952, + "mean_token_accuracy": 0.7151007056236267, + "num_tokens": 164507414.0, + "step": 6350 + }, + { + "epoch": 0.697452229299363, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.129103899002075, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.705113410949707, + "num_tokens": 164535855.0, + "step": 6351 + }, + { + "epoch": 0.6975620470019768, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.5110321044921875, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7107174396514893, + "num_tokens": 164558183.0, + "step": 6352 + }, + { + "epoch": 0.6976718647045904, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.2805168628692627, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7065750956535339, + "num_tokens": 164584290.0, + "step": 6353 + }, + { + "epoch": 0.6977816824072041, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.95712947845459, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.711585521697998, + "num_tokens": 164601322.0, + "step": 6354 + }, + { + "epoch": 0.6978915001098177, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.9338319301605225, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7301598191261292, + "num_tokens": 164618813.0, + "step": 6355 + }, + { + "epoch": 0.6980013178124314, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.450263738632202, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7127305269241333, + "num_tokens": 164642036.0, + "step": 6356 + }, + { + "epoch": 0.698111135515045, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.1437792778015137, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.7093789577484131, + "num_tokens": 164674367.0, + "step": 6357 + }, + { + "epoch": 0.6982209532176586, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.4603915214538574, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 
0.7093172073364258, + "num_tokens": 164696694.0, + "step": 6358 + }, + { + "epoch": 0.6983307709202724, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.259336471557617, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7280754446983337, + "num_tokens": 164722792.0, + "step": 6359 + }, + { + "epoch": 0.698440588622886, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.400756359100342, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7086561322212219, + "num_tokens": 164748014.0, + "step": 6360 + }, + { + "epoch": 0.6985504063254997, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3763575553894043, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.693518877029419, + "num_tokens": 164772019.0, + "step": 6361 + }, + { + "epoch": 0.6986602240281133, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.276939868927002, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6948041915893555, + "num_tokens": 164800535.0, + "step": 6362 + }, + { + "epoch": 0.698770041730727, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3088924884796143, + "learning_rate": 1e-06, + "loss": 1.0685, + "mean_token_accuracy": 0.6831112504005432, + "num_tokens": 164827307.0, + "step": 6363 + }, + { + "epoch": 0.6988798594333406, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.56362247467041, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7160936594009399, + "num_tokens": 164850156.0, + "step": 6364 + }, + { + "epoch": 0.6989896771359543, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3750665187835693, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7075941562652588, + "num_tokens": 164876107.0, + "step": 6365 + }, + { + "epoch": 0.6990994948385679, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.1648924350738525, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6987287998199463, + "num_tokens": 
164905536.0, + "step": 6366 + }, + { + "epoch": 0.6992093125411817, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3370652198791504, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7161365747451782, + "num_tokens": 164930711.0, + "step": 6367 + }, + { + "epoch": 0.6993191302437953, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.065816640853882, + "learning_rate": 1e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.6750245094299316, + "num_tokens": 164965215.0, + "step": 6368 + }, + { + "epoch": 0.699428947946409, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 7.2868428230285645, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7017542719841003, + "num_tokens": 164987382.0, + "step": 6369 + }, + { + "epoch": 0.6995387656490226, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.438239097595215, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6845758557319641, + "num_tokens": 165016576.0, + "step": 6370 + }, + { + "epoch": 0.6996485833516363, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3671369552612305, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.692314624786377, + "num_tokens": 165043318.0, + "step": 6371 + }, + { + "epoch": 0.6997584010542499, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.6570780277252197, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.7000706195831299, + "num_tokens": 165063912.0, + "step": 6372 + }, + { + "epoch": 0.6998682187568636, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3078653812408447, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6980352401733398, + "num_tokens": 165091183.0, + "step": 6373 + }, + { + "epoch": 0.6999780364594773, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.253807783126831, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7182720899581909, + "num_tokens": 165117875.0, + "step": 6374 + }, + { + 
"epoch": 0.700087854162091, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.616091728210449, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7182483673095703, + "num_tokens": 165139314.0, + "step": 6375 + }, + { + "epoch": 0.7001976718647046, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.413177728652954, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7130739688873291, + "num_tokens": 165164950.0, + "step": 6376 + }, + { + "epoch": 0.7003074895673183, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 8.556148529052734, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.707737386226654, + "num_tokens": 165190399.0, + "step": 6377 + }, + { + "epoch": 0.7004173072699319, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.7458784580230713, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.706113338470459, + "num_tokens": 165213424.0, + "step": 6378 + }, + { + "epoch": 0.7005271249725455, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.378424644470215, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7116768956184387, + "num_tokens": 165239322.0, + "step": 6379 + }, + { + "epoch": 0.7006369426751592, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.034120559692383, + "learning_rate": 1e-06, + "loss": 1.0882, + "mean_token_accuracy": 0.6744483113288879, + "num_tokens": 165276415.0, + "step": 6380 + }, + { + "epoch": 0.700746760377773, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.2797727584838867, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.684714138507843, + "num_tokens": 165304440.0, + "step": 6381 + }, + { + "epoch": 0.7008565780803866, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3815832138061523, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.713061511516571, + "num_tokens": 165329692.0, + "step": 6382 + }, + { + "epoch": 0.7009663957830002, + "ewc_loss": 
1.3172626495361328e-05, + "grad_norm": 2.109793186187744, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7066574692726135, + "num_tokens": 165361978.0, + "step": 6383 + }, + { + "epoch": 0.7010762134856139, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.1465628147125244, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7128358483314514, + "num_tokens": 165391661.0, + "step": 6384 + }, + { + "epoch": 0.7011860311882275, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.566824197769165, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7041398286819458, + "num_tokens": 165412857.0, + "step": 6385 + }, + { + "epoch": 0.7012958488908412, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.6879374980926514, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7249690294265747, + "num_tokens": 165434553.0, + "step": 6386 + }, + { + "epoch": 0.7014056665934548, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3773438930511475, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.719792902469635, + "num_tokens": 165458779.0, + "step": 6387 + }, + { + "epoch": 0.7015154842960686, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.1677684783935547, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7259978652000427, + "num_tokens": 165484387.0, + "step": 6388 + }, + { + "epoch": 0.7016253019986822, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3359580039978027, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7087705135345459, + "num_tokens": 165508058.0, + "step": 6389 + }, + { + "epoch": 0.7017351197012959, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.282386064529419, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6945606470108032, + "num_tokens": 165534481.0, + "step": 6390 + }, + { + "epoch": 0.7018449374039095, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 
2.198559522628784, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7035942077636719, + "num_tokens": 165562167.0, + "step": 6391 + }, + { + "epoch": 0.7019547551065232, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.759295701980591, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7257612347602844, + "num_tokens": 165580369.0, + "step": 6392 + }, + { + "epoch": 0.7020645728091368, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.395282030105591, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6996757984161377, + "num_tokens": 165606836.0, + "step": 6393 + }, + { + "epoch": 0.7021743905117505, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.6191561222076416, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7020217180252075, + "num_tokens": 165629360.0, + "step": 6394 + }, + { + "epoch": 0.7022842082143641, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.4256491661071777, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6911827325820923, + "num_tokens": 165653418.0, + "step": 6395 + }, + { + "epoch": 0.7023940259169779, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.2545907497406006, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7309366464614868, + "num_tokens": 165678775.0, + "step": 6396 + }, + { + "epoch": 0.7025038436195915, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.317143440246582, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7086033821105957, + "num_tokens": 165704460.0, + "step": 6397 + }, + { + "epoch": 0.7026136613222052, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.29203462600708, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.71608567237854, + "num_tokens": 165731850.0, + "step": 6398 + }, + { + "epoch": 0.7027234790248188, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.392819404602051, + "learning_rate": 
1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6973620653152466, + "num_tokens": 165757577.0, + "step": 6399 + }, + { + "epoch": 0.7028332967274324, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.2278428077697754, + "learning_rate": 1e-06, + "loss": 1.1169, + "mean_token_accuracy": 0.6738011240959167, + "num_tokens": 165787726.0, + "step": 6400 + }, + { + "epoch": 0.7029431144300461, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.557204484939575, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7030196785926819, + "num_tokens": 165810931.0, + "step": 6401 + }, + { + "epoch": 0.7030529321326597, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3774397373199463, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6889216303825378, + "num_tokens": 165837238.0, + "step": 6402 + }, + { + "epoch": 0.7031627498352735, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.1204283237457275, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.705860435962677, + "num_tokens": 165867065.0, + "step": 6403 + }, + { + "epoch": 0.7032725675378871, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.2261390686035156, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6925922632217407, + "num_tokens": 165895370.0, + "step": 6404 + }, + { + "epoch": 0.7033823852405008, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.0949301719665527, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6903672218322754, + "num_tokens": 165928520.0, + "step": 6405 + }, + { + "epoch": 0.7034922029431144, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3776767253875732, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6898652911186218, + "num_tokens": 165952076.0, + "step": 6406 + }, + { + "epoch": 0.7036020206457281, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.208134651184082, + "learning_rate": 1e-06, + "loss": 0.9895, + 
"mean_token_accuracy": 0.7037703394889832, + "num_tokens": 165978496.0, + "step": 6407 + }, + { + "epoch": 0.7037118383483417, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.4208197593688965, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.701263964176178, + "num_tokens": 166000883.0, + "step": 6408 + }, + { + "epoch": 0.7038216560509554, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.2454495429992676, + "learning_rate": 1e-06, + "loss": 1.0988, + "mean_token_accuracy": 0.6791335344314575, + "num_tokens": 166029905.0, + "step": 6409 + }, + { + "epoch": 0.7039314737535691, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.4599571228027344, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7025685906410217, + "num_tokens": 166052445.0, + "step": 6410 + }, + { + "epoch": 0.7040412914561828, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3150603771209717, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6986631155014038, + "num_tokens": 166080139.0, + "step": 6411 + }, + { + "epoch": 0.7041511091587964, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.2685601711273193, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7045173048973083, + "num_tokens": 166109391.0, + "step": 6412 + }, + { + "epoch": 0.7042609268614101, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.541269302368164, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.677057147026062, + "num_tokens": 166138729.0, + "step": 6413 + }, + { + "epoch": 0.7043707445640237, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.4763388633728027, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6835265755653381, + "num_tokens": 166164711.0, + "step": 6414 + }, + { + "epoch": 0.7044805622666374, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.902463912963867, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 
0.7171714901924133, + "num_tokens": 166182875.0, + "step": 6415 + }, + { + "epoch": 0.704590379969251, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.276212692260742, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.703263521194458, + "num_tokens": 166209426.0, + "step": 6416 + }, + { + "epoch": 0.7047001976718648, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3063724040985107, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6813576817512512, + "num_tokens": 166234077.0, + "step": 6417 + }, + { + "epoch": 0.7048100153744784, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.1354005336761475, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.699341356754303, + "num_tokens": 166263591.0, + "step": 6418 + }, + { + "epoch": 0.704919833077092, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.399225950241089, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7078845500946045, + "num_tokens": 166287278.0, + "step": 6419 + }, + { + "epoch": 0.7050296507797057, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 2.5471994876861572, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.729961633682251, + "num_tokens": 166309647.0, + "step": 6420 + }, + { + "epoch": 0.7051394684823193, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3060314655303955, + "learning_rate": 1e-06, + "loss": 1.0677, + "mean_token_accuracy": 0.6841928362846375, + "num_tokens": 166336724.0, + "step": 6421 + }, + { + "epoch": 0.705249286184933, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 2.544908046722412, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7031807899475098, + "num_tokens": 166361211.0, + "step": 6422 + }, + { + "epoch": 0.7053591038875466, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.153336524963379, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7236238718032837, + "num_tokens": 
166385822.0, + "step": 6423 + }, + { + "epoch": 0.7054689215901603, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.4190421104431152, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7409852743148804, + "num_tokens": 166407055.0, + "step": 6424 + }, + { + "epoch": 0.705578739292774, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.390545129776001, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6944316625595093, + "num_tokens": 166430708.0, + "step": 6425 + }, + { + "epoch": 0.7056885569953877, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 2.3788743019104004, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.701300323009491, + "num_tokens": 166456779.0, + "step": 6426 + }, + { + "epoch": 0.7057983746980013, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.333091974258423, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7085139155387878, + "num_tokens": 166481233.0, + "step": 6427 + }, + { + "epoch": 0.705908192400615, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.647242307662964, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7170926332473755, + "num_tokens": 166501202.0, + "step": 6428 + }, + { + "epoch": 0.7060180101032286, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 2.1512537002563477, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7002389430999756, + "num_tokens": 166531152.0, + "step": 6429 + }, + { + "epoch": 0.7061278278058423, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.429847240447998, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7151802778244019, + "num_tokens": 166553589.0, + "step": 6430 + }, + { + "epoch": 0.7062376455084559, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.3646299839019775, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7100770473480225, + "num_tokens": 166578512.0, + "step": 6431 + }, + { + 
"epoch": 0.7063474632110697, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.544755697250366, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7113771438598633, + "num_tokens": 166599150.0, + "step": 6432 + }, + { + "epoch": 0.7064572809136833, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 2.298386573791504, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.684473991394043, + "num_tokens": 166626449.0, + "step": 6433 + }, + { + "epoch": 0.706567098616297, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 2.4807827472686768, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7232476472854614, + "num_tokens": 166648587.0, + "step": 6434 + }, + { + "epoch": 0.7066769163189106, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.5734939575195312, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7207757830619812, + "num_tokens": 166670674.0, + "step": 6435 + }, + { + "epoch": 0.7067867340215243, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.371889352798462, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.704281210899353, + "num_tokens": 166695813.0, + "step": 6436 + }, + { + "epoch": 0.7068965517241379, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.6156833171844482, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.714155912399292, + "num_tokens": 166716119.0, + "step": 6437 + }, + { + "epoch": 0.7070063694267515, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.211320161819458, + "learning_rate": 1e-06, + "loss": 1.1292, + "mean_token_accuracy": 0.6709412336349487, + "num_tokens": 166750666.0, + "step": 6438 + }, + { + "epoch": 0.7071161871293653, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.637895107269287, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7054346799850464, + "num_tokens": 166772775.0, + "step": 6439 + }, + { + "epoch": 0.707226004831979, + "ewc_loss": 
1.329183578491211e-05, + "grad_norm": 2.6693220138549805, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7212457656860352, + "num_tokens": 166792522.0, + "step": 6440 + }, + { + "epoch": 0.7073358225345926, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2976677417755127, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6984415054321289, + "num_tokens": 166821634.0, + "step": 6441 + }, + { + "epoch": 0.7074456402372062, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.4415700435638428, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6910253763198853, + "num_tokens": 166847977.0, + "step": 6442 + }, + { + "epoch": 0.7075554579398199, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.5371756553649902, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6976228356361389, + "num_tokens": 166871906.0, + "step": 6443 + }, + { + "epoch": 0.7076652756424335, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.1042771339416504, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.7019942998886108, + "num_tokens": 166903174.0, + "step": 6444 + }, + { + "epoch": 0.7077750933450472, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.293651819229126, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7234252691268921, + "num_tokens": 166927652.0, + "step": 6445 + }, + { + "epoch": 0.7078849110476609, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.3818087577819824, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7065025568008423, + "num_tokens": 166951923.0, + "step": 6446 + }, + { + "epoch": 0.7079947287502746, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.660245656967163, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7081496715545654, + "num_tokens": 166973547.0, + "step": 6447 + }, + { + "epoch": 0.7081045464528882, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 
2.647801160812378, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7258315086364746, + "num_tokens": 166992928.0, + "step": 6448 + }, + { + "epoch": 0.7082143641555019, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.509134292602539, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7158542275428772, + "num_tokens": 167014443.0, + "step": 6449 + }, + { + "epoch": 0.7083241818581155, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.2473878860473633, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6988147497177124, + "num_tokens": 167040164.0, + "step": 6450 + }, + { + "epoch": 0.7084339995607292, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.2848315238952637, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.696440577507019, + "num_tokens": 167065933.0, + "step": 6451 + }, + { + "epoch": 0.7085438172633428, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.7809858322143555, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7111433148384094, + "num_tokens": 167083429.0, + "step": 6452 + }, + { + "epoch": 0.7086536349659565, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.4073293209075928, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7079243063926697, + "num_tokens": 167108697.0, + "step": 6453 + }, + { + "epoch": 0.7087634526685702, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.1174280643463135, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7136300206184387, + "num_tokens": 167138333.0, + "step": 6454 + }, + { + "epoch": 0.7088732703711839, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.1974406242370605, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6990575790405273, + "num_tokens": 167166755.0, + "step": 6455 + }, + { + "epoch": 0.7089830880737975, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.5146093368530273, + "learning_rate": 
1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7150110006332397, + "num_tokens": 167189785.0, + "step": 6456 + }, + { + "epoch": 0.7090929057764112, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.65682315826416, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.71709144115448, + "num_tokens": 167211804.0, + "step": 6457 + }, + { + "epoch": 0.7092027234790248, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.508578062057495, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6943051815032959, + "num_tokens": 167233908.0, + "step": 6458 + }, + { + "epoch": 0.7093125411816384, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.4244954586029053, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7058543562889099, + "num_tokens": 167258245.0, + "step": 6459 + }, + { + "epoch": 0.7094223588842521, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2752857208251953, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6968043446540833, + "num_tokens": 167284185.0, + "step": 6460 + }, + { + "epoch": 0.7095321765868658, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.3575432300567627, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7069398164749146, + "num_tokens": 167309018.0, + "step": 6461 + }, + { + "epoch": 0.7096419942894795, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.17722225189209, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7099834680557251, + "num_tokens": 167337963.0, + "step": 6462 + }, + { + "epoch": 0.7097518119920931, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.1213555335998535, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6822304725646973, + "num_tokens": 167368817.0, + "step": 6463 + }, + { + "epoch": 0.7098616296947068, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2883267402648926, + "learning_rate": 1e-06, + "loss": 1.0054, + 
"mean_token_accuracy": 0.7033070921897888, + "num_tokens": 167395048.0, + "step": 6464 + }, + { + "epoch": 0.7099714473973204, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.1035537719726562, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7095388174057007, + "num_tokens": 167424637.0, + "step": 6465 + }, + { + "epoch": 0.7100812650999341, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.05436372756958, + "learning_rate": 1e-06, + "loss": 1.0768, + "mean_token_accuracy": 0.6814968585968018, + "num_tokens": 167459751.0, + "step": 6466 + }, + { + "epoch": 0.7101910828025477, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.3093228340148926, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7172043919563293, + "num_tokens": 167484847.0, + "step": 6467 + }, + { + "epoch": 0.7103009005051615, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.408512592315674, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7103531956672668, + "num_tokens": 167507780.0, + "step": 6468 + }, + { + "epoch": 0.7104107182077751, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.406240224838257, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7184770107269287, + "num_tokens": 167530304.0, + "step": 6469 + }, + { + "epoch": 0.7105205359103888, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.560457944869995, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6932111978530884, + "num_tokens": 167551363.0, + "step": 6470 + }, + { + "epoch": 0.7106303536130024, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.1633639335632324, + "learning_rate": 1e-06, + "loss": 1.0768, + "mean_token_accuracy": 0.6824838519096375, + "num_tokens": 167582669.0, + "step": 6471 + }, + { + "epoch": 0.7107401713156161, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.0957767963409424, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6877333521842957, + 
"num_tokens": 167613850.0, + "step": 6472 + }, + { + "epoch": 0.7108499890182297, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.417675018310547, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7032734751701355, + "num_tokens": 167646709.0, + "step": 6473 + }, + { + "epoch": 0.7109598067208434, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.227051019668579, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.7092125415802002, + "num_tokens": 167675359.0, + "step": 6474 + }, + { + "epoch": 0.7110696244234571, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.48409104347229, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6973313093185425, + "num_tokens": 167698157.0, + "step": 6475 + }, + { + "epoch": 0.7111794421260708, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.258631467819214, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6934448480606079, + "num_tokens": 167725839.0, + "step": 6476 + }, + { + "epoch": 0.7112892598286844, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 2.382413387298584, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7221692800521851, + "num_tokens": 167749767.0, + "step": 6477 + }, + { + "epoch": 0.711399077531298, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.245973587036133, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.693114161491394, + "num_tokens": 167777055.0, + "step": 6478 + }, + { + "epoch": 0.7115088952339117, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.4967455863952637, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7249202132225037, + "num_tokens": 167799343.0, + "step": 6479 + }, + { + "epoch": 0.7116187129365253, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.4305591583251953, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7175698280334473, + "num_tokens": 167822339.0, + "step": 6480 + }, 
+ { + "epoch": 0.711728530639139, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2976348400115967, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7242576479911804, + "num_tokens": 167847874.0, + "step": 6481 + }, + { + "epoch": 0.7118383483417526, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.448432683944702, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6927222609519958, + "num_tokens": 167871087.0, + "step": 6482 + }, + { + "epoch": 0.7119481660443664, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.251142740249634, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6892783641815186, + "num_tokens": 167898170.0, + "step": 6483 + }, + { + "epoch": 0.71205798374698, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2593441009521484, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7051619291305542, + "num_tokens": 167923656.0, + "step": 6484 + }, + { + "epoch": 0.7121678014495937, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.3172638416290283, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6836264133453369, + "num_tokens": 167949266.0, + "step": 6485 + }, + { + "epoch": 0.7122776191522073, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.3535940647125244, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7067473530769348, + "num_tokens": 167972994.0, + "step": 6486 + }, + { + "epoch": 0.712387436854821, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.4282052516937256, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7009574174880981, + "num_tokens": 167997392.0, + "step": 6487 + }, + { + "epoch": 0.7124972545574346, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.469175100326538, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6917333602905273, + "num_tokens": 168022751.0, + "step": 6488 + }, + { + "epoch": 0.7126070722600483, + 
"ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.1837408542633057, + "learning_rate": 1e-06, + "loss": 1.1029, + "mean_token_accuracy": 0.6841830015182495, + "num_tokens": 168052042.0, + "step": 6489 + }, + { + "epoch": 0.712716889962662, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.7837724685668945, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7042402625083923, + "num_tokens": 168071391.0, + "step": 6490 + }, + { + "epoch": 0.7128267076652757, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2323904037475586, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6952039003372192, + "num_tokens": 168102465.0, + "step": 6491 + }, + { + "epoch": 0.7129365253678893, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.823516368865967, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6974202394485474, + "num_tokens": 168122278.0, + "step": 6492 + }, + { + "epoch": 0.713046343070503, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2533278465270996, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.7075074911117554, + "num_tokens": 168151102.0, + "step": 6493 + }, + { + "epoch": 0.7131561607731166, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.378103494644165, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7318079471588135, + "num_tokens": 168175128.0, + "step": 6494 + }, + { + "epoch": 0.7132659784757303, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.204932689666748, + "learning_rate": 1e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.6737571358680725, + "num_tokens": 168207639.0, + "step": 6495 + }, + { + "epoch": 0.7133757961783439, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.405663013458252, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7041367292404175, + "num_tokens": 168233243.0, + "step": 6496 + }, + { + "epoch": 0.7134856138809577, + "ewc_loss": 1.329183578491211e-05, + 
"grad_norm": 2.494624137878418, + "learning_rate": 1e-06, + "loss": 1.1173, + "mean_token_accuracy": 0.6764949560165405, + "num_tokens": 168260065.0, + "step": 6497 + }, + { + "epoch": 0.7135954315835713, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.3546013832092285, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7183303236961365, + "num_tokens": 168284638.0, + "step": 6498 + }, + { + "epoch": 0.713705249286185, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.462242603302002, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7023489475250244, + "num_tokens": 168310957.0, + "step": 6499 + }, + { + "epoch": 0.7138150669887986, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.4749810695648193, + "learning_rate": 1e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6823214292526245, + "num_tokens": 168336000.0, + "step": 6500 + }, + { + "epoch": 0.7139248846914122, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.5440595149993896, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7088186740875244, + "num_tokens": 168358893.0, + "step": 6501 + }, + { + "epoch": 0.7140347023940259, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.3605458736419678, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7241062521934509, + "num_tokens": 168382930.0, + "step": 6502 + }, + { + "epoch": 0.7141445200966395, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.338146448135376, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6875057220458984, + "num_tokens": 168410606.0, + "step": 6503 + }, + { + "epoch": 0.7142543377992533, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.1560587882995605, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7064313888549805, + "num_tokens": 168441184.0, + "step": 6504 + }, + { + "epoch": 0.7143641555018669, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.5275652408599854, + 
"learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7125693559646606, + "num_tokens": 168464355.0, + "step": 6505 + }, + { + "epoch": 0.7144739732044806, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.313278913497925, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7061310410499573, + "num_tokens": 168491266.0, + "step": 6506 + }, + { + "epoch": 0.7145837909070942, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.538231134414673, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6920408010482788, + "num_tokens": 168516585.0, + "step": 6507 + }, + { + "epoch": 0.7146936086097079, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.8897695541381836, + "learning_rate": 1e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6869267225265503, + "num_tokens": 168535196.0, + "step": 6508 + }, + { + "epoch": 0.7148034263123215, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.263138771057129, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7087320685386658, + "num_tokens": 168561111.0, + "step": 6509 + }, + { + "epoch": 0.7149132440149352, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.8059449195861816, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7232843041419983, + "num_tokens": 168578428.0, + "step": 6510 + }, + { + "epoch": 0.7150230617175488, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.192545175552368, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7057141065597534, + "num_tokens": 168607648.0, + "step": 6511 + }, + { + "epoch": 0.7151328794201626, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.31018328666687, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6936931610107422, + "num_tokens": 168636005.0, + "step": 6512 + }, + { + "epoch": 0.7152426971227762, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2367918491363525, + "learning_rate": 1e-06, + "loss": 1.0641, + 
"mean_token_accuracy": 0.6928345561027527, + "num_tokens": 168664419.0, + "step": 6513 + }, + { + "epoch": 0.7153525148253899, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.383136749267578, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7242695689201355, + "num_tokens": 168687639.0, + "step": 6514 + }, + { + "epoch": 0.7154623325280035, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.7414634227752686, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.7028748989105225, + "num_tokens": 168712456.0, + "step": 6515 + }, + { + "epoch": 0.7155721502306172, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.2603251934051514, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7177414298057556, + "num_tokens": 168740946.0, + "step": 6516 + }, + { + "epoch": 0.7156819679332308, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.261831045150757, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6848487257957458, + "num_tokens": 168769603.0, + "step": 6517 + }, + { + "epoch": 0.7157917856358444, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.3612358570098877, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7243427634239197, + "num_tokens": 168795783.0, + "step": 6518 + }, + { + "epoch": 0.7159016033384582, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.9552624225616455, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7285410761833191, + "num_tokens": 168813454.0, + "step": 6519 + }, + { + "epoch": 0.7160114210410718, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.6874518394470215, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.717110276222229, + "num_tokens": 168834759.0, + "step": 6520 + }, + { + "epoch": 0.7161212387436855, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 2.399811029434204, + "learning_rate": 1e-06, + "loss": 1.0887, + "mean_token_accuracy": 0.6860677599906921, + 
"num_tokens": 168861563.0, + "step": 6521 + }, + { + "epoch": 0.7162310564462991, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2779877185821533, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7034077644348145, + "num_tokens": 168888884.0, + "step": 6522 + }, + { + "epoch": 0.7163408741489128, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.182419776916504, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7054581642150879, + "num_tokens": 168917774.0, + "step": 6523 + }, + { + "epoch": 0.7164506918515264, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.659071445465088, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7112647294998169, + "num_tokens": 168936506.0, + "step": 6524 + }, + { + "epoch": 0.7165605095541401, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.299867630004883, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6944637298583984, + "num_tokens": 168963308.0, + "step": 6525 + }, + { + "epoch": 0.7166703272567538, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.612187385559082, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7285047769546509, + "num_tokens": 168985502.0, + "step": 6526 + }, + { + "epoch": 0.7167801449593675, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4932472705841064, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.735795259475708, + "num_tokens": 169006486.0, + "step": 6527 + }, + { + "epoch": 0.7168899626619811, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.3203773498535156, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6984634399414062, + "num_tokens": 169030895.0, + "step": 6528 + }, + { + "epoch": 0.7169997803645948, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.5061581134796143, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7023611068725586, + "num_tokens": 169054078.0, + "step": 6529 + }, + { + 
"epoch": 0.7171095980672084, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.72016978263855, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.717200517654419, + "num_tokens": 169072668.0, + "step": 6530 + }, + { + "epoch": 0.7172194157698221, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.3711493015289307, + "learning_rate": 1e-06, + "loss": 1.0749, + "mean_token_accuracy": 0.6912491321563721, + "num_tokens": 169099329.0, + "step": 6531 + }, + { + "epoch": 0.7173292334724357, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4937431812286377, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7091082334518433, + "num_tokens": 169120903.0, + "step": 6532 + }, + { + "epoch": 0.7174390511750495, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2310683727264404, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6861428022384644, + "num_tokens": 169149640.0, + "step": 6533 + }, + { + "epoch": 0.7175488688776631, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.5962324142456055, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7118638753890991, + "num_tokens": 169170780.0, + "step": 6534 + }, + { + "epoch": 0.7176586865802768, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2684528827667236, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6904211640357971, + "num_tokens": 169200303.0, + "step": 6535 + }, + { + "epoch": 0.7177685042828904, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.103065013885498, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7227562665939331, + "num_tokens": 169229281.0, + "step": 6536 + }, + { + "epoch": 0.717878321985504, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4495224952697754, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7208822965621948, + "num_tokens": 169252296.0, + "step": 6537 + }, + { + "epoch": 0.7179881396881177, + "ewc_loss": 
1.33514404296875e-05, + "grad_norm": 2.377345085144043, + "learning_rate": 1e-06, + "loss": 1.0789, + "mean_token_accuracy": 0.6869536638259888, + "num_tokens": 169279224.0, + "step": 6538 + }, + { + "epoch": 0.7180979573907313, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.405377149581909, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6981309652328491, + "num_tokens": 169307045.0, + "step": 6539 + }, + { + "epoch": 0.7182077750933451, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4858150482177734, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7030519247055054, + "num_tokens": 169333309.0, + "step": 6540 + }, + { + "epoch": 0.7183175927959587, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.3854358196258545, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6871852278709412, + "num_tokens": 169362714.0, + "step": 6541 + }, + { + "epoch": 0.7184274104985724, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.472294807434082, + "learning_rate": 1e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6919874548912048, + "num_tokens": 169390450.0, + "step": 6542 + }, + { + "epoch": 0.718537228201186, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2159037590026855, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6927752494812012, + "num_tokens": 169419864.0, + "step": 6543 + }, + { + "epoch": 0.7186470459037997, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.474348545074463, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7328885793685913, + "num_tokens": 169441594.0, + "step": 6544 + }, + { + "epoch": 0.7187568636064133, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.38761830329895, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7011956572532654, + "num_tokens": 169467608.0, + "step": 6545 + }, + { + "epoch": 0.718866681309027, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 
2.4022908210754395, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6968687772750854, + "num_tokens": 169494768.0, + "step": 6546 + }, + { + "epoch": 0.7189764990116406, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.24222731590271, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6981860399246216, + "num_tokens": 169521616.0, + "step": 6547 + }, + { + "epoch": 0.7190863167142544, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.560243844985962, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7186230421066284, + "num_tokens": 169542364.0, + "step": 6548 + }, + { + "epoch": 0.719196134416868, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.248863697052002, + "learning_rate": 1e-06, + "loss": 1.1024, + "mean_token_accuracy": 0.676542341709137, + "num_tokens": 169574336.0, + "step": 6549 + }, + { + "epoch": 0.7193059521194817, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.1254019737243652, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7006759643554688, + "num_tokens": 169602383.0, + "step": 6550 + }, + { + "epoch": 0.7194157698220953, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4245669841766357, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7013622522354126, + "num_tokens": 169625680.0, + "step": 6551 + }, + { + "epoch": 0.719525587524709, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4286270141601562, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7099246978759766, + "num_tokens": 169649483.0, + "step": 6552 + }, + { + "epoch": 0.7196354052273226, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.247730016708374, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7143236398696899, + "num_tokens": 169676176.0, + "step": 6553 + }, + { + "epoch": 0.7197452229299363, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4962029457092285, + "learning_rate": 1e-06, + "loss": 
0.9831, + "mean_token_accuracy": 0.7018396258354187, + "num_tokens": 169700230.0, + "step": 6554 + }, + { + "epoch": 0.71985504063255, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.443765640258789, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6979436874389648, + "num_tokens": 169724249.0, + "step": 6555 + }, + { + "epoch": 0.7199648583351637, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.398172378540039, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6974338293075562, + "num_tokens": 169750728.0, + "step": 6556 + }, + { + "epoch": 0.7200746760377773, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.126168727874756, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6989251375198364, + "num_tokens": 169782386.0, + "step": 6557 + }, + { + "epoch": 0.720184493740391, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2592639923095703, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7186186909675598, + "num_tokens": 169809532.0, + "step": 6558 + }, + { + "epoch": 0.7202943114430046, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2919695377349854, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6949175596237183, + "num_tokens": 169838593.0, + "step": 6559 + }, + { + "epoch": 0.7204041291456182, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2467434406280518, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7089295983314514, + "num_tokens": 169865464.0, + "step": 6560 + }, + { + "epoch": 0.7205139468482319, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.1550486087799072, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7242851257324219, + "num_tokens": 169894499.0, + "step": 6561 + }, + { + "epoch": 0.7206237645508456, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.212630271911621, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6921476721763611, + 
"num_tokens": 169922616.0, + "step": 6562 + }, + { + "epoch": 0.7207335822534593, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2696213722229004, + "learning_rate": 1e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.6822953224182129, + "num_tokens": 169950672.0, + "step": 6563 + }, + { + "epoch": 0.7208433999560729, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.3738131523132324, + "learning_rate": 1e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.6951163411140442, + "num_tokens": 169974535.0, + "step": 6564 + }, + { + "epoch": 0.7209532176586866, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4699087142944336, + "learning_rate": 1e-06, + "loss": 1.121, + "mean_token_accuracy": 0.677562952041626, + "num_tokens": 170002421.0, + "step": 6565 + }, + { + "epoch": 0.7210630353613002, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.121929883956909, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7166973948478699, + "num_tokens": 170032139.0, + "step": 6566 + }, + { + "epoch": 0.7211728530639139, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.5909788608551025, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7026220560073853, + "num_tokens": 170051814.0, + "step": 6567 + }, + { + "epoch": 0.7212826707665275, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.1375205516815186, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7126203775405884, + "num_tokens": 170080831.0, + "step": 6568 + }, + { + "epoch": 0.7213924884691413, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.1499547958374023, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6986270546913147, + "num_tokens": 170108149.0, + "step": 6569 + }, + { + "epoch": 0.7215023061717549, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2186901569366455, + "learning_rate": 1e-06, + "loss": 1.0911, + "mean_token_accuracy": 0.6829302906990051, + "num_tokens": 170137356.0, + "step": 6570 + }, + { 
+ "epoch": 0.7216121238743686, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.495112419128418, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7067248821258545, + "num_tokens": 170161300.0, + "step": 6571 + }, + { + "epoch": 0.7217219415769822, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4033989906311035, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7229835987091064, + "num_tokens": 170184667.0, + "step": 6572 + }, + { + "epoch": 0.7218317592795959, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.147364854812622, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6899124383926392, + "num_tokens": 170215863.0, + "step": 6573 + }, + { + "epoch": 0.7219415769822095, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.410538673400879, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7103438973426819, + "num_tokens": 170239781.0, + "step": 6574 + }, + { + "epoch": 0.7220513946848232, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.2832930088043213, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6920309066772461, + "num_tokens": 170266272.0, + "step": 6575 + }, + { + "epoch": 0.7221612123874368, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.4414427280426025, + "learning_rate": 1e-06, + "loss": 1.1142, + "mean_token_accuracy": 0.6713000535964966, + "num_tokens": 170293997.0, + "step": 6576 + }, + { + "epoch": 0.7222710300900506, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.470935106277466, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7336353063583374, + "num_tokens": 170316894.0, + "step": 6577 + }, + { + "epoch": 0.7223808477926642, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.3178257942199707, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.720798909664154, + "num_tokens": 170343035.0, + "step": 6578 + }, + { + "epoch": 0.7224906654952779, + "ewc_loss": 
1.33514404296875e-05, + "grad_norm": 2.3732845783233643, + "learning_rate": 1e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6948805451393127, + "num_tokens": 170368423.0, + "step": 6579 + }, + { + "epoch": 0.7226004831978915, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.288576602935791, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7215826511383057, + "num_tokens": 170393599.0, + "step": 6580 + }, + { + "epoch": 0.7227103009005051, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.281118392944336, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7248543500900269, + "num_tokens": 170419722.0, + "step": 6581 + }, + { + "epoch": 0.7228201186031188, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.439946413040161, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7084876298904419, + "num_tokens": 170443024.0, + "step": 6582 + }, + { + "epoch": 0.7229299363057324, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.117713212966919, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7069987654685974, + "num_tokens": 170471462.0, + "step": 6583 + }, + { + "epoch": 0.7230397540083462, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.246494770050049, + "learning_rate": 1e-06, + "loss": 1.0996, + "mean_token_accuracy": 0.6814202070236206, + "num_tokens": 170502437.0, + "step": 6584 + }, + { + "epoch": 0.7231495717109598, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.380655527114868, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7160244584083557, + "num_tokens": 170525603.0, + "step": 6585 + }, + { + "epoch": 0.7232593894135735, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1177961826324463, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.699854850769043, + "num_tokens": 170555958.0, + "step": 6586 + }, + { + "epoch": 0.7233692071161871, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 
2.427191972732544, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6984187364578247, + "num_tokens": 170579904.0, + "step": 6587 + }, + { + "epoch": 0.7234790248188008, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.7524845600128174, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7149859070777893, + "num_tokens": 170599073.0, + "step": 6588 + }, + { + "epoch": 0.7235888425214144, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.184096336364746, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7268855571746826, + "num_tokens": 170624812.0, + "step": 6589 + }, + { + "epoch": 0.7236986602240281, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.485772132873535, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6945657730102539, + "num_tokens": 170649253.0, + "step": 6590 + }, + { + "epoch": 0.7238084779266418, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4574925899505615, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6895979642868042, + "num_tokens": 170672443.0, + "step": 6591 + }, + { + "epoch": 0.7239182956292555, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1944942474365234, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6907365322113037, + "num_tokens": 170701394.0, + "step": 6592 + }, + { + "epoch": 0.7240281133318691, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.490283727645874, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.705568790435791, + "num_tokens": 170725227.0, + "step": 6593 + }, + { + "epoch": 0.7241379310344828, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.591552734375, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7064549922943115, + "num_tokens": 170747080.0, + "step": 6594 + }, + { + "epoch": 0.7242477487370964, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.28920841217041, + "learning_rate": 1e-06, + 
"loss": 0.9357, + "mean_token_accuracy": 0.7208472490310669, + "num_tokens": 170769740.0, + "step": 6595 + }, + { + "epoch": 0.72435756643971, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.425245761871338, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.688339114189148, + "num_tokens": 170795664.0, + "step": 6596 + }, + { + "epoch": 0.7244673841423237, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1314878463745117, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7128395438194275, + "num_tokens": 170824411.0, + "step": 6597 + }, + { + "epoch": 0.7245772018449375, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.431001663208008, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6851236820220947, + "num_tokens": 170850123.0, + "step": 6598 + }, + { + "epoch": 0.7246870195475511, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4881858825683594, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7000830173492432, + "num_tokens": 170873265.0, + "step": 6599 + }, + { + "epoch": 0.7247968372501647, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2919363975524902, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6959168910980225, + "num_tokens": 170899152.0, + "step": 6600 + }, + { + "epoch": 0.7249066549527784, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1742076873779297, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6940404176712036, + "num_tokens": 170929558.0, + "step": 6601 + }, + { + "epoch": 0.725016472655392, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.317660093307495, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7167567610740662, + "num_tokens": 170955891.0, + "step": 6602 + }, + { + "epoch": 0.7251262903580057, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.191829204559326, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 
0.7135259509086609, + "num_tokens": 170984853.0, + "step": 6603 + }, + { + "epoch": 0.7252361080606193, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.088486909866333, + "learning_rate": 1e-06, + "loss": 1.1302, + "mean_token_accuracy": 0.6751739978790283, + "num_tokens": 171017608.0, + "step": 6604 + }, + { + "epoch": 0.725345925763233, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4852828979492188, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6963667869567871, + "num_tokens": 171043941.0, + "step": 6605 + }, + { + "epoch": 0.7254557434658467, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4862356185913086, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6923689842224121, + "num_tokens": 171067231.0, + "step": 6606 + }, + { + "epoch": 0.7255655611684604, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1955580711364746, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6960374116897583, + "num_tokens": 171094751.0, + "step": 6607 + }, + { + "epoch": 0.725675378871074, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3119704723358154, + "learning_rate": 1e-06, + "loss": 1.0798, + "mean_token_accuracy": 0.6895971894264221, + "num_tokens": 171121793.0, + "step": 6608 + }, + { + "epoch": 0.7257851965736877, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3824849128723145, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.692393958568573, + "num_tokens": 171147151.0, + "step": 6609 + }, + { + "epoch": 0.7258950142763013, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3012237548828125, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6987413167953491, + "num_tokens": 171173165.0, + "step": 6610 + }, + { + "epoch": 0.726004831978915, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4944610595703125, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.72149658203125, + "num_tokens": 
171195083.0, + "step": 6611 + }, + { + "epoch": 0.7261146496815286, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1689453125, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6850878000259399, + "num_tokens": 171227814.0, + "step": 6612 + }, + { + "epoch": 0.7262244673841424, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.318751811981201, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6907066702842712, + "num_tokens": 171255157.0, + "step": 6613 + }, + { + "epoch": 0.726334285086756, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2837655544281006, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7060220241546631, + "num_tokens": 171279921.0, + "step": 6614 + }, + { + "epoch": 0.7264441027893697, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.401172399520874, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6878407001495361, + "num_tokens": 171306367.0, + "step": 6615 + }, + { + "epoch": 0.7265539204919833, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2402102947235107, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.687122106552124, + "num_tokens": 171335470.0, + "step": 6616 + }, + { + "epoch": 0.726663738194597, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.351045608520508, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6936131715774536, + "num_tokens": 171360291.0, + "step": 6617 + }, + { + "epoch": 0.7267735558972106, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2911438941955566, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.7060153484344482, + "num_tokens": 171386241.0, + "step": 6618 + }, + { + "epoch": 0.7268833735998242, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3508102893829346, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6869567632675171, + "num_tokens": 171412587.0, + "step": 6619 + }, + { + "epoch": 
0.726993191302438, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.561352491378784, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.7019985914230347, + "num_tokens": 171434516.0, + "step": 6620 + }, + { + "epoch": 0.7271030090050516, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2147746086120605, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6834479570388794, + "num_tokens": 171464916.0, + "step": 6621 + }, + { + "epoch": 0.7272128267076653, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4113283157348633, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6942358016967773, + "num_tokens": 171489467.0, + "step": 6622 + }, + { + "epoch": 0.7273226444102789, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.8242123126983643, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7356652617454529, + "num_tokens": 171506265.0, + "step": 6623 + }, + { + "epoch": 0.7274324621128926, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.474325180053711, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7260280847549438, + "num_tokens": 171527160.0, + "step": 6624 + }, + { + "epoch": 0.7275422798155062, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2972514629364014, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7105603218078613, + "num_tokens": 171552198.0, + "step": 6625 + }, + { + "epoch": 0.7276520975181199, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.592573881149292, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7065585851669312, + "num_tokens": 171573001.0, + "step": 6626 + }, + { + "epoch": 0.7277619152207336, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1916375160217285, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.687909722328186, + "num_tokens": 171602431.0, + "step": 6627 + }, + { + "epoch": 0.7278717329233473, + "ewc_loss": 
1.341104507446289e-05, + "grad_norm": 2.2979400157928467, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6999939680099487, + "num_tokens": 171631809.0, + "step": 6628 + }, + { + "epoch": 0.7279815506259609, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4272804260253906, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6988950967788696, + "num_tokens": 171655463.0, + "step": 6629 + }, + { + "epoch": 0.7280913683285746, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4053232669830322, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7185029983520508, + "num_tokens": 171679233.0, + "step": 6630 + }, + { + "epoch": 0.7282011860311882, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3236212730407715, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7113370299339294, + "num_tokens": 171704875.0, + "step": 6631 + }, + { + "epoch": 0.7283110037338019, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.111408233642578, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7124145030975342, + "num_tokens": 171736957.0, + "step": 6632 + }, + { + "epoch": 0.7284208214364155, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.250234603881836, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6803543567657471, + "num_tokens": 171765231.0, + "step": 6633 + }, + { + "epoch": 0.7285306391390292, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.511552572250366, + "learning_rate": 1e-06, + "loss": 1.0886, + "mean_token_accuracy": 0.6857119798660278, + "num_tokens": 171790280.0, + "step": 6634 + }, + { + "epoch": 0.7286404568416429, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2813010215759277, + "learning_rate": 1e-06, + "loss": 1.1125, + "mean_token_accuracy": 0.6824742555618286, + "num_tokens": 171823396.0, + "step": 6635 + }, + { + "epoch": 0.7287502745442566, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 
2.206403970718384, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7115199565887451, + "num_tokens": 171852932.0, + "step": 6636 + }, + { + "epoch": 0.7288600922468702, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.185051918029785, + "learning_rate": 1e-06, + "loss": 1.1141, + "mean_token_accuracy": 0.6743413209915161, + "num_tokens": 171882645.0, + "step": 6637 + }, + { + "epoch": 0.7289699099494839, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3671791553497314, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6861962080001831, + "num_tokens": 171907999.0, + "step": 6638 + }, + { + "epoch": 0.7290797276520975, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.575199604034424, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7166705131530762, + "num_tokens": 171929437.0, + "step": 6639 + }, + { + "epoch": 0.7291895453547111, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3398494720458984, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7183157205581665, + "num_tokens": 171952768.0, + "step": 6640 + }, + { + "epoch": 0.7292993630573248, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.133009910583496, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6949540376663208, + "num_tokens": 171982817.0, + "step": 6641 + }, + { + "epoch": 0.7294091807599385, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3912250995635986, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7023313045501709, + "num_tokens": 172008264.0, + "step": 6642 + }, + { + "epoch": 0.7295189984625522, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.173982858657837, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.698185384273529, + "num_tokens": 172037187.0, + "step": 6643 + }, + { + "epoch": 0.7296288161651658, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.475775957107544, + "learning_rate": 1e-06, 
+ "loss": 1.0304, + "mean_token_accuracy": 0.6978691220283508, + "num_tokens": 172061265.0, + "step": 6644 + }, + { + "epoch": 0.7297386338677795, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3755345344543457, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7080810070037842, + "num_tokens": 172088106.0, + "step": 6645 + }, + { + "epoch": 0.7298484515703931, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3846590518951416, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6951623558998108, + "num_tokens": 172111278.0, + "step": 6646 + }, + { + "epoch": 0.7299582692730068, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2597196102142334, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6933802366256714, + "num_tokens": 172139161.0, + "step": 6647 + }, + { + "epoch": 0.7300680869756204, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.329683542251587, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6832830905914307, + "num_tokens": 172164497.0, + "step": 6648 + }, + { + "epoch": 0.7301779046782342, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1036338806152344, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6872266530990601, + "num_tokens": 172195776.0, + "step": 6649 + }, + { + "epoch": 0.7302877223808478, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.253957748413086, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7100326418876648, + "num_tokens": 172222583.0, + "step": 6650 + }, + { + "epoch": 0.7303975400834615, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4159882068634033, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6921414136886597, + "num_tokens": 172247462.0, + "step": 6651 + }, + { + "epoch": 0.7305073577860751, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2347896099090576, + "learning_rate": 1e-06, + "loss": 1.0707, + 
"mean_token_accuracy": 0.6895527243614197, + "num_tokens": 172275377.0, + "step": 6652 + }, + { + "epoch": 0.7306171754886888, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.46441650390625, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7232364416122437, + "num_tokens": 172297610.0, + "step": 6653 + }, + { + "epoch": 0.7307269931913024, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2807505130767822, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7095999717712402, + "num_tokens": 172322826.0, + "step": 6654 + }, + { + "epoch": 0.7308368108939161, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4461281299591064, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6939250826835632, + "num_tokens": 172347222.0, + "step": 6655 + }, + { + "epoch": 0.7309466285965298, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.186350107192993, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.72432541847229, + "num_tokens": 172373660.0, + "step": 6656 + }, + { + "epoch": 0.7310564462991435, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1247496604919434, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6932095885276794, + "num_tokens": 172402897.0, + "step": 6657 + }, + { + "epoch": 0.7311662640017571, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.44447922706604, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7119153738021851, + "num_tokens": 172424908.0, + "step": 6658 + }, + { + "epoch": 0.7312760817043708, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.276942253112793, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7005118131637573, + "num_tokens": 172452343.0, + "step": 6659 + }, + { + "epoch": 0.7313858994069844, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.391268491744995, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7280393838882446, + 
"num_tokens": 172474687.0, + "step": 6660 + }, + { + "epoch": 0.731495717109598, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.156404972076416, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7142218947410583, + "num_tokens": 172501083.0, + "step": 6661 + }, + { + "epoch": 0.7316055348122117, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2010178565979004, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6917339563369751, + "num_tokens": 172529837.0, + "step": 6662 + }, + { + "epoch": 0.7317153525148253, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.0033791065216064, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6985089778900146, + "num_tokens": 172565242.0, + "step": 6663 + }, + { + "epoch": 0.7318251702174391, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3659276962280273, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7197241187095642, + "num_tokens": 172590053.0, + "step": 6664 + }, + { + "epoch": 0.7319349879200527, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.062467098236084, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7137396335601807, + "num_tokens": 172617737.0, + "step": 6665 + }, + { + "epoch": 0.7320448056226664, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2748656272888184, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7117413878440857, + "num_tokens": 172643719.0, + "step": 6666 + }, + { + "epoch": 0.73215462332528, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.2219834327697754, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7095699310302734, + "num_tokens": 172670802.0, + "step": 6667 + }, + { + "epoch": 0.7322644410278937, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.0880463123321533, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6992040276527405, + "num_tokens": 172701329.0, + "step": 6668 + 
}, + { + "epoch": 0.7323742587305073, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.505857467651367, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6967483162879944, + "num_tokens": 172723687.0, + "step": 6669 + }, + { + "epoch": 0.732484076433121, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1092302799224854, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6980080008506775, + "num_tokens": 172752354.0, + "step": 6670 + }, + { + "epoch": 0.7325938941357347, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.414966344833374, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6967905759811401, + "num_tokens": 172776323.0, + "step": 6671 + }, + { + "epoch": 0.7327037118383484, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.3651983737945557, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.712045431137085, + "num_tokens": 172800904.0, + "step": 6672 + }, + { + "epoch": 0.732813529540962, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.4433324337005615, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7107886075973511, + "num_tokens": 172823370.0, + "step": 6673 + }, + { + "epoch": 0.7329233472435757, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.453598737716675, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.70952308177948, + "num_tokens": 172846258.0, + "step": 6674 + }, + { + "epoch": 0.7330331649461893, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.6423912048339844, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7026630640029907, + "num_tokens": 172867940.0, + "step": 6675 + }, + { + "epoch": 0.733142982648803, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.81644606590271, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6996895670890808, + "num_tokens": 172886700.0, + "step": 6676 + }, + { + "epoch": 0.7332528003514166, + 
"ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3156566619873047, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7022113800048828, + "num_tokens": 172914941.0, + "step": 6677 + }, + { + "epoch": 0.7333626180540304, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.465367555618286, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7156988382339478, + "num_tokens": 172939971.0, + "step": 6678 + }, + { + "epoch": 0.733472435756644, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.237553596496582, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.7081880569458008, + "num_tokens": 172968136.0, + "step": 6679 + }, + { + "epoch": 0.7335822534592576, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.35689115524292, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.7016672492027283, + "num_tokens": 172993683.0, + "step": 6680 + }, + { + "epoch": 0.7336920711618713, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.45192551612854, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6983476877212524, + "num_tokens": 173018448.0, + "step": 6681 + }, + { + "epoch": 0.7338018888644849, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.336505174636841, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6994656324386597, + "num_tokens": 173044670.0, + "step": 6682 + }, + { + "epoch": 0.7339117065670986, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3059351444244385, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6920360326766968, + "num_tokens": 173072158.0, + "step": 6683 + }, + { + "epoch": 0.7340215242697122, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.0684685707092285, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7079981565475464, + "num_tokens": 173101918.0, + "step": 6684 + }, + { + "epoch": 0.734131341972326, + "ewc_loss": 1.3530254364013672e-05, + 
"grad_norm": 2.3566246032714844, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6978078484535217, + "num_tokens": 173127160.0, + "step": 6685 + }, + { + "epoch": 0.7342411596749396, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2451674938201904, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6898249387741089, + "num_tokens": 173153811.0, + "step": 6686 + }, + { + "epoch": 0.7343509773775533, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4113717079162598, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6837402582168579, + "num_tokens": 173178814.0, + "step": 6687 + }, + { + "epoch": 0.7344607950801669, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.5740621089935303, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6913307309150696, + "num_tokens": 173200290.0, + "step": 6688 + }, + { + "epoch": 0.7345706127827806, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.263399839401245, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.69163578748703, + "num_tokens": 173226366.0, + "step": 6689 + }, + { + "epoch": 0.7346804304853942, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.3880016803741455, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6951102018356323, + "num_tokens": 173253655.0, + "step": 6690 + }, + { + "epoch": 0.7347902481880079, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.65047550201416, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7168686985969543, + "num_tokens": 173275181.0, + "step": 6691 + }, + { + "epoch": 0.7349000658906216, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.2651844024658203, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7095193266868591, + "num_tokens": 173302924.0, + "step": 6692 + }, + { + "epoch": 0.7350098835932353, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.3939428329467773, + 
"learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6966458559036255, + "num_tokens": 173327534.0, + "step": 6693 + }, + { + "epoch": 0.7351197012958489, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.382965564727783, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7175493240356445, + "num_tokens": 173352614.0, + "step": 6694 + }, + { + "epoch": 0.7352295189984626, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.5959761142730713, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7034193277359009, + "num_tokens": 173374986.0, + "step": 6695 + }, + { + "epoch": 0.7353393367010762, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.777472496032715, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7021795511245728, + "num_tokens": 173394416.0, + "step": 6696 + }, + { + "epoch": 0.7354491544036899, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.6350038051605225, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7144356966018677, + "num_tokens": 173417035.0, + "step": 6697 + }, + { + "epoch": 0.7355589721063035, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.1372523307800293, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7134720683097839, + "num_tokens": 173447210.0, + "step": 6698 + }, + { + "epoch": 0.7356687898089171, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3272597789764404, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6996893882751465, + "num_tokens": 173473313.0, + "step": 6699 + }, + { + "epoch": 0.7357786075115309, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.3812673091888428, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7361040711402893, + "num_tokens": 173495252.0, + "step": 6700 + }, + { + "epoch": 0.7358884252141445, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.421586751937866, + "learning_rate": 1e-06, + "loss": 
0.9353, + "mean_token_accuracy": 0.724531352519989, + "num_tokens": 173521442.0, + "step": 6701 + }, + { + "epoch": 0.7359982429167582, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.2473299503326416, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.718850314617157, + "num_tokens": 173547300.0, + "step": 6702 + }, + { + "epoch": 0.7361080606193718, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.236642599105835, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.712551474571228, + "num_tokens": 173576514.0, + "step": 6703 + }, + { + "epoch": 0.7362178783219855, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.5541038513183594, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.7000315189361572, + "num_tokens": 173602159.0, + "step": 6704 + }, + { + "epoch": 0.7363276960245991, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.4090821743011475, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.706132709980011, + "num_tokens": 173627353.0, + "step": 6705 + }, + { + "epoch": 0.7364375137272128, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.303316593170166, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7213335633277893, + "num_tokens": 173655914.0, + "step": 6706 + }, + { + "epoch": 0.7365473314298265, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.3500730991363525, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6995932459831238, + "num_tokens": 173682089.0, + "step": 6707 + }, + { + "epoch": 0.7366571491324402, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.2169201374053955, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7179540991783142, + "num_tokens": 173710899.0, + "step": 6708 + }, + { + "epoch": 0.7367669668350538, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4898319244384766, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 
0.6965317726135254, + "num_tokens": 173734020.0, + "step": 6709 + }, + { + "epoch": 0.7368767845376675, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.1002416610717773, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.714304506778717, + "num_tokens": 173763730.0, + "step": 6710 + }, + { + "epoch": 0.7369866022402811, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.4768800735473633, + "learning_rate": 1e-06, + "loss": 1.0975, + "mean_token_accuracy": 0.6707656383514404, + "num_tokens": 173789322.0, + "step": 6711 + }, + { + "epoch": 0.7370964199428948, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.0987021923065186, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6875298619270325, + "num_tokens": 173821838.0, + "step": 6712 + }, + { + "epoch": 0.7372062376455084, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.305850028991699, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7170723676681519, + "num_tokens": 173848132.0, + "step": 6713 + }, + { + "epoch": 0.7373160553481222, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.352267265319824, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7294325232505798, + "num_tokens": 173872793.0, + "step": 6714 + }, + { + "epoch": 0.7374258730507358, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.3866682052612305, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7144079208374023, + "num_tokens": 173897476.0, + "step": 6715 + }, + { + "epoch": 0.7375356907533495, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 2.3911190032958984, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7018477916717529, + "num_tokens": 173921544.0, + "step": 6716 + }, + { + "epoch": 0.7376455084559631, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5116806030273438, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7162575125694275, + 
"num_tokens": 173942604.0, + "step": 6717 + }, + { + "epoch": 0.7377553261585768, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3102986812591553, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7037976384162903, + "num_tokens": 173967385.0, + "step": 6718 + }, + { + "epoch": 0.7378651438611904, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.523322582244873, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7127108573913574, + "num_tokens": 173989018.0, + "step": 6719 + }, + { + "epoch": 0.737974961563804, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4110448360443115, + "learning_rate": 1e-06, + "loss": 1.099, + "mean_token_accuracy": 0.6851872205734253, + "num_tokens": 174014756.0, + "step": 6720 + }, + { + "epoch": 0.7380847792664178, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.7123377323150635, + "learning_rate": 1e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.6929163932800293, + "num_tokens": 174036322.0, + "step": 6721 + }, + { + "epoch": 0.7381945969690314, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.256239175796509, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7278769612312317, + "num_tokens": 174061966.0, + "step": 6722 + }, + { + "epoch": 0.7383044146716451, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.6444146633148193, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6959929466247559, + "num_tokens": 174082133.0, + "step": 6723 + }, + { + "epoch": 0.7384142323742587, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.0994417667388916, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6992853879928589, + "num_tokens": 174114587.0, + "step": 6724 + }, + { + "epoch": 0.7385240500768724, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1416385173797607, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6828908920288086, + "num_tokens": 174144910.0, + 
"step": 6725 + }, + { + "epoch": 0.738633867779486, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2768502235412598, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7035550475120544, + "num_tokens": 174170044.0, + "step": 6726 + }, + { + "epoch": 0.7387436854820997, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.308340072631836, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7284967303276062, + "num_tokens": 174194128.0, + "step": 6727 + }, + { + "epoch": 0.7388535031847133, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5387630462646484, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7280591130256653, + "num_tokens": 174213613.0, + "step": 6728 + }, + { + "epoch": 0.7389633208873271, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.324526071548462, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.718165397644043, + "num_tokens": 174239072.0, + "step": 6729 + }, + { + "epoch": 0.7390731385899407, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.261528491973877, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.69175124168396, + "num_tokens": 174266710.0, + "step": 6730 + }, + { + "epoch": 0.7391829562925544, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.450887441635132, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7059875726699829, + "num_tokens": 174289578.0, + "step": 6731 + }, + { + "epoch": 0.739292773995168, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4268314838409424, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7052266001701355, + "num_tokens": 174312357.0, + "step": 6732 + }, + { + "epoch": 0.7394025916977817, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.6628329753875732, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.708748996257782, + "num_tokens": 174333509.0, + "step": 6733 + }, + { + "epoch": 
0.7395124094003953, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.518129587173462, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7221370935440063, + "num_tokens": 174354178.0, + "step": 6734 + }, + { + "epoch": 0.739622227103009, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.534337043762207, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7007774710655212, + "num_tokens": 174377254.0, + "step": 6735 + }, + { + "epoch": 0.7397320448056227, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4665493965148926, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7104253172874451, + "num_tokens": 174400581.0, + "step": 6736 + }, + { + "epoch": 0.7398418625082364, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.134065628051758, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6968363523483276, + "num_tokens": 174431436.0, + "step": 6737 + }, + { + "epoch": 0.73995168021085, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5439155101776123, + "learning_rate": 1e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.7025455832481384, + "num_tokens": 174456476.0, + "step": 6738 + }, + { + "epoch": 0.7400614979134637, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.561854839324951, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.6854081153869629, + "num_tokens": 174479634.0, + "step": 6739 + }, + { + "epoch": 0.7401713156160773, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.379673480987549, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.702759861946106, + "num_tokens": 174504223.0, + "step": 6740 + }, + { + "epoch": 0.7402811333186909, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2002370357513428, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7237436771392822, + "num_tokens": 174532251.0, + "step": 6741 + }, + { + "epoch": 0.7403909510213046, + "ewc_loss": 
1.3530254364013672e-05, + "grad_norm": 2.649252414703369, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7176156044006348, + "num_tokens": 174551363.0, + "step": 6742 + }, + { + "epoch": 0.7405007687239183, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.9918142557144165, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7107152938842773, + "num_tokens": 174585552.0, + "step": 6743 + }, + { + "epoch": 0.740610586426532, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.391883134841919, + "learning_rate": 1e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6910804510116577, + "num_tokens": 174612006.0, + "step": 6744 + }, + { + "epoch": 0.7407204041291456, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4476382732391357, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6964666247367859, + "num_tokens": 174635438.0, + "step": 6745 + }, + { + "epoch": 0.7408302218317593, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1713359355926514, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6972948312759399, + "num_tokens": 174663764.0, + "step": 6746 + }, + { + "epoch": 0.7409400395343729, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.457298755645752, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7068911194801331, + "num_tokens": 174686855.0, + "step": 6747 + }, + { + "epoch": 0.7410498572369866, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.460676431655884, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6924213171005249, + "num_tokens": 174712408.0, + "step": 6748 + }, + { + "epoch": 0.7411596749396002, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.092008352279663, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.697203516960144, + "num_tokens": 174746301.0, + "step": 6749 + }, + { + "epoch": 0.741269492642214, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 
2.254286766052246, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7048258185386658, + "num_tokens": 174772707.0, + "step": 6750 + }, + { + "epoch": 0.7413793103448276, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1820852756500244, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6990616321563721, + "num_tokens": 174800887.0, + "step": 6751 + }, + { + "epoch": 0.7414891280474413, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.330796718597412, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7183238863945007, + "num_tokens": 174825484.0, + "step": 6752 + }, + { + "epoch": 0.7415989457500549, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2103817462921143, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6948912143707275, + "num_tokens": 174856788.0, + "step": 6753 + }, + { + "epoch": 0.7417087634526686, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2491326332092285, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7149984836578369, + "num_tokens": 174883870.0, + "step": 6754 + }, + { + "epoch": 0.7418185811552822, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3124892711639404, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7121671438217163, + "num_tokens": 174911277.0, + "step": 6755 + }, + { + "epoch": 0.7419283988578959, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2661044597625732, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7149782776832581, + "num_tokens": 174937395.0, + "step": 6756 + }, + { + "epoch": 0.7420382165605095, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4213271141052246, + "learning_rate": 1e-06, + "loss": 1.0753, + "mean_token_accuracy": 0.6841294169425964, + "num_tokens": 174962830.0, + "step": 6757 + }, + { + "epoch": 0.7421480342631233, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1902947425842285, + 
"learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6997862458229065, + "num_tokens": 174989345.0, + "step": 6758 + }, + { + "epoch": 0.7422578519657369, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4613986015319824, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7195842862129211, + "num_tokens": 175012986.0, + "step": 6759 + }, + { + "epoch": 0.7423676696683505, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.8230721950531006, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7081377506256104, + "num_tokens": 175032096.0, + "step": 6760 + }, + { + "epoch": 0.7424774873709642, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.8095641136169434, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.731070876121521, + "num_tokens": 175051955.0, + "step": 6761 + }, + { + "epoch": 0.7425873050735778, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3795509338378906, + "learning_rate": 1e-06, + "loss": 1.1122, + "mean_token_accuracy": 0.6776080131530762, + "num_tokens": 175078938.0, + "step": 6762 + }, + { + "epoch": 0.7426971227761915, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.203845500946045, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7184355854988098, + "num_tokens": 175104952.0, + "step": 6763 + }, + { + "epoch": 0.7428069404788051, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4017300605773926, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6963613033294678, + "num_tokens": 175129499.0, + "step": 6764 + }, + { + "epoch": 0.7429167581814189, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.373865842819214, + "learning_rate": 1e-06, + "loss": 1.1111, + "mean_token_accuracy": 0.6744612455368042, + "num_tokens": 175155643.0, + "step": 6765 + }, + { + "epoch": 0.7430265758840325, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4255852699279785, + "learning_rate": 1e-06, + "loss": 
1.0228, + "mean_token_accuracy": 0.6967206001281738, + "num_tokens": 175179215.0, + "step": 6766 + }, + { + "epoch": 0.7431363935866462, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1983156204223633, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6954059600830078, + "num_tokens": 175208654.0, + "step": 6767 + }, + { + "epoch": 0.7432462112892598, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.294389009475708, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6992636322975159, + "num_tokens": 175235570.0, + "step": 6768 + }, + { + "epoch": 0.7433560289918735, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3791327476501465, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6940444707870483, + "num_tokens": 175260521.0, + "step": 6769 + }, + { + "epoch": 0.7434658466944871, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.289533853530884, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7181345224380493, + "num_tokens": 175287083.0, + "step": 6770 + }, + { + "epoch": 0.7435756643971008, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.563492774963379, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7142874002456665, + "num_tokens": 175309073.0, + "step": 6771 + }, + { + "epoch": 0.7436854820997145, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.73533034324646, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7067151069641113, + "num_tokens": 175327506.0, + "step": 6772 + }, + { + "epoch": 0.7437952998023282, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.637956380844116, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7074492573738098, + "num_tokens": 175350031.0, + "step": 6773 + }, + { + "epoch": 0.7439051175049418, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.471487045288086, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 
0.7078109979629517, + "num_tokens": 175371750.0, + "step": 6774 + }, + { + "epoch": 0.7440149352075555, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.295538902282715, + "learning_rate": 1e-06, + "loss": 1.0915, + "mean_token_accuracy": 0.6762010455131531, + "num_tokens": 175398988.0, + "step": 6775 + }, + { + "epoch": 0.7441247529101691, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.144258499145508, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7083588242530823, + "num_tokens": 175427102.0, + "step": 6776 + }, + { + "epoch": 0.7442345706127828, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2798078060150146, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7031053900718689, + "num_tokens": 175454098.0, + "step": 6777 + }, + { + "epoch": 0.7443443883153964, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2055275440216064, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6879550218582153, + "num_tokens": 175481485.0, + "step": 6778 + }, + { + "epoch": 0.7444542060180102, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4532864093780518, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7029968500137329, + "num_tokens": 175505092.0, + "step": 6779 + }, + { + "epoch": 0.7445640237206238, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.369980573654175, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6957084536552429, + "num_tokens": 175529187.0, + "step": 6780 + }, + { + "epoch": 0.7446738414232374, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.505603075027466, + "learning_rate": 1e-06, + "loss": 1.0904, + "mean_token_accuracy": 0.6863067746162415, + "num_tokens": 175554714.0, + "step": 6781 + }, + { + "epoch": 0.7447836591258511, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.276925563812256, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6959470510482788, + "num_tokens": 
175582796.0, + "step": 6782 + }, + { + "epoch": 0.7448934768284647, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4243123531341553, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6995241641998291, + "num_tokens": 175606081.0, + "step": 6783 + }, + { + "epoch": 0.7450032945310784, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2896506786346436, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7200813293457031, + "num_tokens": 175632000.0, + "step": 6784 + }, + { + "epoch": 0.745113112233692, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2351934909820557, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7142390012741089, + "num_tokens": 175658364.0, + "step": 6785 + }, + { + "epoch": 0.7452229299363057, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4760165214538574, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7084572315216064, + "num_tokens": 175683050.0, + "step": 6786 + }, + { + "epoch": 0.7453327476389194, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.432598352432251, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7147746682167053, + "num_tokens": 175706060.0, + "step": 6787 + }, + { + "epoch": 0.7454425653415331, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1932854652404785, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7136403322219849, + "num_tokens": 175734301.0, + "step": 6788 + }, + { + "epoch": 0.7455523830441467, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3808352947235107, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7188284397125244, + "num_tokens": 175757178.0, + "step": 6789 + }, + { + "epoch": 0.7456622007467604, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.7138257026672363, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6869463324546814, + "num_tokens": 175778863.0, + "step": 6790 + }, + 
{ + "epoch": 0.745772018449374, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2641818523406982, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7090197801589966, + "num_tokens": 175806023.0, + "step": 6791 + }, + { + "epoch": 0.7458818361519877, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5402023792266846, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.702570915222168, + "num_tokens": 175828049.0, + "step": 6792 + }, + { + "epoch": 0.7459916538546013, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.7088212966918945, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7061362266540527, + "num_tokens": 175848135.0, + "step": 6793 + }, + { + "epoch": 0.7461014715572151, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.216627836227417, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6994723081588745, + "num_tokens": 175876498.0, + "step": 6794 + }, + { + "epoch": 0.7462112892598287, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3167591094970703, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6969179511070251, + "num_tokens": 175902733.0, + "step": 6795 + }, + { + "epoch": 0.7463211069624424, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5841500759124756, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7056702375411987, + "num_tokens": 175925566.0, + "step": 6796 + }, + { + "epoch": 0.746430924665056, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2375400066375732, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6852457523345947, + "num_tokens": 175955080.0, + "step": 6797 + }, + { + "epoch": 0.7465407423676697, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4708189964294434, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7174099087715149, + "num_tokens": 175977085.0, + "step": 6798 + }, + { + "epoch": 0.7466505600702833, + 
"ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.385106325149536, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7073330283164978, + "num_tokens": 176003377.0, + "step": 6799 + }, + { + "epoch": 0.7467603777728969, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.1327109336853027, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7062594890594482, + "num_tokens": 176030427.0, + "step": 6800 + }, + { + "epoch": 0.7468701954755107, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.182509660720825, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6840392351150513, + "num_tokens": 176058425.0, + "step": 6801 + }, + { + "epoch": 0.7469800131781243, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.360048294067383, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7308163642883301, + "num_tokens": 176081969.0, + "step": 6802 + }, + { + "epoch": 0.747089830880738, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3214595317840576, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7161107063293457, + "num_tokens": 176106404.0, + "step": 6803 + }, + { + "epoch": 0.7471996485833516, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.211570978164673, + "learning_rate": 1e-06, + "loss": 1.1059, + "mean_token_accuracy": 0.6689703464508057, + "num_tokens": 176137354.0, + "step": 6804 + }, + { + "epoch": 0.7473094662859653, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.331861972808838, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.7014460563659668, + "num_tokens": 176162288.0, + "step": 6805 + }, + { + "epoch": 0.7474192839885789, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.2941205501556396, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6981753706932068, + "num_tokens": 176189523.0, + "step": 6806 + }, + { + "epoch": 0.7475291016911926, + "ewc_loss": 1.3589859008789062e-05, + 
"grad_norm": 2.4823648929595947, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7038886547088623, + "num_tokens": 176213616.0, + "step": 6807 + }, + { + "epoch": 0.7476389193938063, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.365539312362671, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7200186848640442, + "num_tokens": 176236767.0, + "step": 6808 + }, + { + "epoch": 0.74774873709642, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.266101598739624, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7206931114196777, + "num_tokens": 176263344.0, + "step": 6809 + }, + { + "epoch": 0.7478585547990336, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3901357650756836, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7023380398750305, + "num_tokens": 176288047.0, + "step": 6810 + }, + { + "epoch": 0.7479683725016473, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.20859694480896, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6979435086250305, + "num_tokens": 176317781.0, + "step": 6811 + }, + { + "epoch": 0.7480781902042609, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.310784339904785, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6798677444458008, + "num_tokens": 176347624.0, + "step": 6812 + }, + { + "epoch": 0.7481880079068746, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.6306357383728027, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7141784429550171, + "num_tokens": 176367772.0, + "step": 6813 + }, + { + "epoch": 0.7482978256094882, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.1753058433532715, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7084968686103821, + "num_tokens": 176400767.0, + "step": 6814 + }, + { + "epoch": 0.7484076433121019, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.347618579864502, + 
"learning_rate": 1e-06, + "loss": 1.0759, + "mean_token_accuracy": 0.6837669610977173, + "num_tokens": 176425357.0, + "step": 6815 + }, + { + "epoch": 0.7485174610147156, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.629976987838745, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6932153701782227, + "num_tokens": 176445214.0, + "step": 6816 + }, + { + "epoch": 0.7486272787173293, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.362447500228882, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.7047344446182251, + "num_tokens": 176472967.0, + "step": 6817 + }, + { + "epoch": 0.7487370964199429, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.294923782348633, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7174241542816162, + "num_tokens": 176502370.0, + "step": 6818 + }, + { + "epoch": 0.7488469141225566, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.4684977531433105, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7031980752944946, + "num_tokens": 176525646.0, + "step": 6819 + }, + { + "epoch": 0.7489567318251702, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.24988055229187, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6922131776809692, + "num_tokens": 176552922.0, + "step": 6820 + }, + { + "epoch": 0.7490665495277838, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.2161638736724854, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7072945833206177, + "num_tokens": 176580090.0, + "step": 6821 + }, + { + "epoch": 0.7491763672303975, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.23993182182312, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7054806351661682, + "num_tokens": 176607392.0, + "step": 6822 + }, + { + "epoch": 0.7492861849330112, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.6164143085479736, + "learning_rate": 1e-06, + "loss": 
0.9534, + "mean_token_accuracy": 0.7180099487304688, + "num_tokens": 176631020.0, + "step": 6823 + }, + { + "epoch": 0.7493960026356249, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.179858684539795, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7232347726821899, + "num_tokens": 176659765.0, + "step": 6824 + }, + { + "epoch": 0.7495058203382385, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.222386360168457, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7246431112289429, + "num_tokens": 176688642.0, + "step": 6825 + }, + { + "epoch": 0.7496156380408522, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.422276735305786, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7292155027389526, + "num_tokens": 176712306.0, + "step": 6826 + }, + { + "epoch": 0.7497254557434658, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.5508673191070557, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7289931774139404, + "num_tokens": 176731830.0, + "step": 6827 + }, + { + "epoch": 0.7498352734460795, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.204050064086914, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6988421678543091, + "num_tokens": 176762912.0, + "step": 6828 + }, + { + "epoch": 0.7499450911486931, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.350766897201538, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6998488306999207, + "num_tokens": 176787808.0, + "step": 6829 + }, + { + "epoch": 0.7500549088513069, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.5526509284973145, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7094442844390869, + "num_tokens": 176809620.0, + "step": 6830 + }, + { + "epoch": 0.7501647265539205, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.146862745285034, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 
0.6981038451194763, + "num_tokens": 176838777.0, + "step": 6831 + }, + { + "epoch": 0.7502745442565342, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.729893207550049, + "learning_rate": 1e-06, + "loss": 1.1092, + "mean_token_accuracy": 0.6814039945602417, + "num_tokens": 176863801.0, + "step": 6832 + }, + { + "epoch": 0.7503843619591478, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.2745513916015625, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7063047885894775, + "num_tokens": 176892098.0, + "step": 6833 + }, + { + "epoch": 0.7504941796617615, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.178901433944702, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7180580496788025, + "num_tokens": 176919806.0, + "step": 6834 + }, + { + "epoch": 0.7506039973643751, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3791840076446533, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6821064352989197, + "num_tokens": 176944799.0, + "step": 6835 + }, + { + "epoch": 0.7507138150669888, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.6150760650634766, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6945024132728577, + "num_tokens": 176968499.0, + "step": 6836 + }, + { + "epoch": 0.7508236327696025, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3938610553741455, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7099423408508301, + "num_tokens": 176992760.0, + "step": 6837 + }, + { + "epoch": 0.7509334504722162, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.294923782348633, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6925122737884521, + "num_tokens": 177019679.0, + "step": 6838 + }, + { + "epoch": 0.7510432681748298, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3623595237731934, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6924238801002502, + 
"num_tokens": 177048441.0, + "step": 6839 + }, + { + "epoch": 0.7511530858774434, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.228609561920166, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7029539942741394, + "num_tokens": 177076460.0, + "step": 6840 + }, + { + "epoch": 0.7512629035800571, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.477027416229248, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.7005879878997803, + "num_tokens": 177099730.0, + "step": 6841 + }, + { + "epoch": 0.7513727212826707, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.5317437648773193, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6997944116592407, + "num_tokens": 177122154.0, + "step": 6842 + }, + { + "epoch": 0.7514825389852844, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.5089426040649414, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7110178470611572, + "num_tokens": 177142945.0, + "step": 6843 + }, + { + "epoch": 0.7515923566878981, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.294314384460449, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.700207531452179, + "num_tokens": 177170514.0, + "step": 6844 + }, + { + "epoch": 0.7517021743905118, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.328115463256836, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6961257457733154, + "num_tokens": 177196408.0, + "step": 6845 + }, + { + "epoch": 0.7518119920931254, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5105133056640625, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7206963300704956, + "num_tokens": 177220300.0, + "step": 6846 + }, + { + "epoch": 0.7519218097957391, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.13087797164917, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6931556463241577, + "num_tokens": 177251674.0, + "step": 
6847 + }, + { + "epoch": 0.7520316274983527, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.084336996078491, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6953428983688354, + "num_tokens": 177284386.0, + "step": 6848 + }, + { + "epoch": 0.7521414452009664, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.0715408325195312, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6946271657943726, + "num_tokens": 177314575.0, + "step": 6849 + }, + { + "epoch": 0.75225126290358, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.294843912124634, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6901364922523499, + "num_tokens": 177340530.0, + "step": 6850 + }, + { + "epoch": 0.7523610806061937, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.525514841079712, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7007207870483398, + "num_tokens": 177363114.0, + "step": 6851 + }, + { + "epoch": 0.7524708983088074, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2012319564819336, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7019307613372803, + "num_tokens": 177392813.0, + "step": 6852 + }, + { + "epoch": 0.7525807160114211, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4838168621063232, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7056963443756104, + "num_tokens": 177416349.0, + "step": 6853 + }, + { + "epoch": 0.7526905337140347, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3924901485443115, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6829907298088074, + "num_tokens": 177441313.0, + "step": 6854 + }, + { + "epoch": 0.7528003514166484, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.70914888381958, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7433872222900391, + "num_tokens": 177462066.0, + "step": 6855 + }, + { + "epoch": 
0.752910169119262, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2997443675994873, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7049012184143066, + "num_tokens": 177488623.0, + "step": 6856 + }, + { + "epoch": 0.7530199868218757, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 6.972487449645996, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7067549228668213, + "num_tokens": 177517433.0, + "step": 6857 + }, + { + "epoch": 0.7531298045244893, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.372110605239868, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6986010074615479, + "num_tokens": 177546971.0, + "step": 6858 + }, + { + "epoch": 0.7532396222271031, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.6172430515289307, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6859596967697144, + "num_tokens": 177568348.0, + "step": 6859 + }, + { + "epoch": 0.7533494399297167, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.415666103363037, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7089418768882751, + "num_tokens": 177595015.0, + "step": 6860 + }, + { + "epoch": 0.7534592576323303, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3116397857666016, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.707827627658844, + "num_tokens": 177621428.0, + "step": 6861 + }, + { + "epoch": 0.753569075334944, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2998135089874268, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7086813449859619, + "num_tokens": 177648253.0, + "step": 6862 + }, + { + "epoch": 0.7536788930375576, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.831717014312744, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7308189868927002, + "num_tokens": 177666539.0, + "step": 6863 + }, + { + "epoch": 0.7537887107401713, + "ewc_loss": 
1.3530254364013672e-05, + "grad_norm": 2.4951417446136475, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7156109809875488, + "num_tokens": 177691319.0, + "step": 6864 + }, + { + "epoch": 0.7538985284427849, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1670284271240234, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6992316246032715, + "num_tokens": 177721356.0, + "step": 6865 + }, + { + "epoch": 0.7540083461453987, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1147735118865967, + "learning_rate": 1e-06, + "loss": 1.0888, + "mean_token_accuracy": 0.6825985908508301, + "num_tokens": 177753114.0, + "step": 6866 + }, + { + "epoch": 0.7541181638480123, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.298288345336914, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7223078608512878, + "num_tokens": 177777685.0, + "step": 6867 + }, + { + "epoch": 0.754227981550626, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.293865203857422, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7063443660736084, + "num_tokens": 177805844.0, + "step": 6868 + }, + { + "epoch": 0.7543377992532396, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.473409414291382, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.693681001663208, + "num_tokens": 177835153.0, + "step": 6869 + }, + { + "epoch": 0.7544476169558533, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2379236221313477, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7099871635437012, + "num_tokens": 177861316.0, + "step": 6870 + }, + { + "epoch": 0.7545574346584669, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.365501642227173, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.708566427230835, + "num_tokens": 177888119.0, + "step": 6871 + }, + { + "epoch": 0.7546672523610806, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 
2.1930694580078125, + "learning_rate": 1e-06, + "loss": 1.0814, + "mean_token_accuracy": 0.6809035539627075, + "num_tokens": 177920001.0, + "step": 6872 + }, + { + "epoch": 0.7547770700636943, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.816873550415039, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7092214822769165, + "num_tokens": 177938096.0, + "step": 6873 + }, + { + "epoch": 0.754886887766308, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4071385860443115, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6980631351470947, + "num_tokens": 177961331.0, + "step": 6874 + }, + { + "epoch": 0.7549967054689216, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.289059638977051, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7254863977432251, + "num_tokens": 177985902.0, + "step": 6875 + }, + { + "epoch": 0.7551065231715353, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5058131217956543, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6853790879249573, + "num_tokens": 178011077.0, + "step": 6876 + }, + { + "epoch": 0.7552163408741489, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5276107788085938, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7163763642311096, + "num_tokens": 178033075.0, + "step": 6877 + }, + { + "epoch": 0.7553261585767626, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.246670722961426, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7011697888374329, + "num_tokens": 178060752.0, + "step": 6878 + }, + { + "epoch": 0.7554359762793762, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2341361045837402, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7228949666023254, + "num_tokens": 178089622.0, + "step": 6879 + }, + { + "epoch": 0.7555457939819898, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1476736068725586, + 
"learning_rate": 1e-06, + "loss": 1.0943, + "mean_token_accuracy": 0.6754864454269409, + "num_tokens": 178120948.0, + "step": 6880 + }, + { + "epoch": 0.7556556116846036, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.732848882675171, + "learning_rate": 1e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.689116358757019, + "num_tokens": 178143058.0, + "step": 6881 + }, + { + "epoch": 0.7557654293872172, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3745687007904053, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7006155848503113, + "num_tokens": 178166367.0, + "step": 6882 + }, + { + "epoch": 0.7558752470898309, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.756234645843506, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7121635675430298, + "num_tokens": 178185632.0, + "step": 6883 + }, + { + "epoch": 0.7559850647924445, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3446083068847656, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6930352449417114, + "num_tokens": 178210509.0, + "step": 6884 + }, + { + "epoch": 0.7560948824950582, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.135291337966919, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6914666891098022, + "num_tokens": 178246098.0, + "step": 6885 + }, + { + "epoch": 0.7562047001976718, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4014928340911865, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.7025985717773438, + "num_tokens": 178269653.0, + "step": 6886 + }, + { + "epoch": 0.7563145179002855, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.420715093612671, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7023667097091675, + "num_tokens": 178297507.0, + "step": 6887 + }, + { + "epoch": 0.7564243356028992, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2614991664886475, + "learning_rate": 1e-06, + "loss": 
0.9598, + "mean_token_accuracy": 0.7104767560958862, + "num_tokens": 178322444.0, + "step": 6888 + }, + { + "epoch": 0.7565341533055129, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.588491439819336, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.688091516494751, + "num_tokens": 178345073.0, + "step": 6889 + }, + { + "epoch": 0.7566439710081265, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.257157802581787, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6929185390472412, + "num_tokens": 178373921.0, + "step": 6890 + }, + { + "epoch": 0.7567537887107402, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2845633029937744, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7031782269477844, + "num_tokens": 178401628.0, + "step": 6891 + }, + { + "epoch": 0.7568636064133538, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4877777099609375, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7298258543014526, + "num_tokens": 178424561.0, + "step": 6892 + }, + { + "epoch": 0.7569734241159675, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2789719104766846, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7280217409133911, + "num_tokens": 178447980.0, + "step": 6893 + }, + { + "epoch": 0.7570832418185811, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.359368324279785, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7150387763977051, + "num_tokens": 178471718.0, + "step": 6894 + }, + { + "epoch": 0.7571930595211949, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3345766067504883, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7000052332878113, + "num_tokens": 178496847.0, + "step": 6895 + }, + { + "epoch": 0.7573028772238085, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.17441463470459, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 
0.688559889793396, + "num_tokens": 178527000.0, + "step": 6896 + }, + { + "epoch": 0.7574126949264222, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.464244842529297, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.7014527320861816, + "num_tokens": 178550133.0, + "step": 6897 + }, + { + "epoch": 0.7575225126290358, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4242758750915527, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6985890865325928, + "num_tokens": 178577050.0, + "step": 6898 + }, + { + "epoch": 0.7576323303316495, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.267242431640625, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7222598791122437, + "num_tokens": 178602725.0, + "step": 6899 + }, + { + "epoch": 0.7577421480342631, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.182426691055298, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.711037814617157, + "num_tokens": 178633785.0, + "step": 6900 + }, + { + "epoch": 0.7578519657368767, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.670616626739502, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7043567299842834, + "num_tokens": 178654144.0, + "step": 6901 + }, + { + "epoch": 0.7579617834394905, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3652901649475098, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6986436247825623, + "num_tokens": 178680025.0, + "step": 6902 + }, + { + "epoch": 0.7580716011421041, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.593202590942383, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6999502182006836, + "num_tokens": 178702119.0, + "step": 6903 + }, + { + "epoch": 0.7581814188447178, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 6.907217979431152, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6952817440032959, + "num_tokens": 
178731672.0, + "step": 6904 + }, + { + "epoch": 0.7582912365473314, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2929508686065674, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7238296270370483, + "num_tokens": 178758582.0, + "step": 6905 + }, + { + "epoch": 0.7584010542499451, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1356990337371826, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6871450543403625, + "num_tokens": 178787672.0, + "step": 6906 + }, + { + "epoch": 0.7585108719525587, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3712551593780518, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6855635046958923, + "num_tokens": 178814095.0, + "step": 6907 + }, + { + "epoch": 0.7586206896551724, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2290585041046143, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6904268860816956, + "num_tokens": 178843461.0, + "step": 6908 + }, + { + "epoch": 0.758730507357786, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2806546688079834, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.7006228566169739, + "num_tokens": 178870250.0, + "step": 6909 + }, + { + "epoch": 0.7588403250603998, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3305675983428955, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7165588140487671, + "num_tokens": 178893417.0, + "step": 6910 + }, + { + "epoch": 0.7589501427630134, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4754927158355713, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6994755268096924, + "num_tokens": 178915244.0, + "step": 6911 + }, + { + "epoch": 0.7590599604656271, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.422070026397705, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6994876861572266, + "num_tokens": 178938120.0, + "step": 6912 + }, + { 
+ "epoch": 0.7591697781682407, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.337859630584717, + "learning_rate": 1e-06, + "loss": 1.0983, + "mean_token_accuracy": 0.6803946495056152, + "num_tokens": 178965558.0, + "step": 6913 + }, + { + "epoch": 0.7592795958708544, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.215575933456421, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7135087847709656, + "num_tokens": 178994111.0, + "step": 6914 + }, + { + "epoch": 0.759389413573468, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5229811668395996, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7027339935302734, + "num_tokens": 179017532.0, + "step": 6915 + }, + { + "epoch": 0.7594992312760817, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.749377965927124, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7003861665725708, + "num_tokens": 179036660.0, + "step": 6916 + }, + { + "epoch": 0.7596090489786954, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.324734926223755, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7018954753875732, + "num_tokens": 179064753.0, + "step": 6917 + }, + { + "epoch": 0.7597188666813091, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.42270565032959, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7160051465034485, + "num_tokens": 179087979.0, + "step": 6918 + }, + { + "epoch": 0.7598286843839227, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4319727420806885, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.69707852602005, + "num_tokens": 179113525.0, + "step": 6919 + }, + { + "epoch": 0.7599385020865363, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2854700088500977, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7140244245529175, + "num_tokens": 179140003.0, + "step": 6920 + }, + { + "epoch": 0.76004831978915, + 
"ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2997567653656006, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.72044837474823, + "num_tokens": 179164191.0, + "step": 6921 + }, + { + "epoch": 0.7601581374917636, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2077484130859375, + "learning_rate": 1e-06, + "loss": 1.0825, + "mean_token_accuracy": 0.6829075813293457, + "num_tokens": 179194456.0, + "step": 6922 + }, + { + "epoch": 0.7602679551943773, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3285083770751953, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7263887524604797, + "num_tokens": 179220806.0, + "step": 6923 + }, + { + "epoch": 0.760377772896991, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.588212490081787, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7003399133682251, + "num_tokens": 179243326.0, + "step": 6924 + }, + { + "epoch": 0.7604875905996047, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.287694215774536, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7063239812850952, + "num_tokens": 179270466.0, + "step": 6925 + }, + { + "epoch": 0.7605974083022183, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1406352519989014, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7160050868988037, + "num_tokens": 179300590.0, + "step": 6926 + }, + { + "epoch": 0.760707226004832, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3921475410461426, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6866409778594971, + "num_tokens": 179324837.0, + "step": 6927 + }, + { + "epoch": 0.7608170437074456, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.094203472137451, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6966627240180969, + "num_tokens": 179356085.0, + "step": 6928 + }, + { + "epoch": 0.7609268614100593, + "ewc_loss": 1.3530254364013672e-05, + 
"grad_norm": 2.427711248397827, + "learning_rate": 1e-06, + "loss": 1.1059, + "mean_token_accuracy": 0.6776988506317139, + "num_tokens": 179380943.0, + "step": 6929 + }, + { + "epoch": 0.7610366791126729, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.333361864089966, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6963205933570862, + "num_tokens": 179406636.0, + "step": 6930 + }, + { + "epoch": 0.7611464968152867, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 7.066391468048096, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6951265335083008, + "num_tokens": 179432128.0, + "step": 6931 + }, + { + "epoch": 0.7612563145179003, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3729119300842285, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6993709802627563, + "num_tokens": 179457591.0, + "step": 6932 + }, + { + "epoch": 0.761366132220514, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3988113403320312, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7086829543113708, + "num_tokens": 179483646.0, + "step": 6933 + }, + { + "epoch": 0.7614759499231276, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.216688394546509, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7089454531669617, + "num_tokens": 179511204.0, + "step": 6934 + }, + { + "epoch": 0.7615857676257413, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.503535747528076, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7044913172721863, + "num_tokens": 179536067.0, + "step": 6935 + }, + { + "epoch": 0.7616955853283549, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.167234182357788, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7335368394851685, + "num_tokens": 179563707.0, + "step": 6936 + }, + { + "epoch": 0.7618054030309686, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3512234687805176, + 
"learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7211698293685913, + "num_tokens": 179587191.0, + "step": 6937 + }, + { + "epoch": 0.7619152207335822, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3080852031707764, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7313944101333618, + "num_tokens": 179610040.0, + "step": 6938 + }, + { + "epoch": 0.762025038436196, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.405965566635132, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.699565052986145, + "num_tokens": 179635594.0, + "step": 6939 + }, + { + "epoch": 0.7621348561388096, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.490227222442627, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7103008031845093, + "num_tokens": 179657892.0, + "step": 6940 + }, + { + "epoch": 0.7622446738414232, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.282240152359009, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7251797914505005, + "num_tokens": 179683394.0, + "step": 6941 + }, + { + "epoch": 0.7623544915440369, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.15627121925354, + "learning_rate": 1e-06, + "loss": 1.0889, + "mean_token_accuracy": 0.6826950907707214, + "num_tokens": 179711403.0, + "step": 6942 + }, + { + "epoch": 0.7624643092466505, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.6962101459503174, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7185204029083252, + "num_tokens": 179730406.0, + "step": 6943 + }, + { + "epoch": 0.7625741269492642, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1925220489501953, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6910516619682312, + "num_tokens": 179761665.0, + "step": 6944 + }, + { + "epoch": 0.7626839446518778, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3995656967163086, + "learning_rate": 1e-06, + "loss": 
0.9975, + "mean_token_accuracy": 0.7089405059814453, + "num_tokens": 179784254.0, + "step": 6945 + }, + { + "epoch": 0.7627937623544916, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.616925001144409, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7010653614997864, + "num_tokens": 179807077.0, + "step": 6946 + }, + { + "epoch": 0.7629035800571052, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.029536485671997, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7146990299224854, + "num_tokens": 179838235.0, + "step": 6947 + }, + { + "epoch": 0.7630133977597189, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2051491737365723, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7039204835891724, + "num_tokens": 179866103.0, + "step": 6948 + }, + { + "epoch": 0.7631232154623325, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4743099212646484, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7172846794128418, + "num_tokens": 179888382.0, + "step": 6949 + }, + { + "epoch": 0.7632330331649462, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5328543186187744, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7098413109779358, + "num_tokens": 179908263.0, + "step": 6950 + }, + { + "epoch": 0.7633428508675598, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.306894540786743, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7126919031143188, + "num_tokens": 179935433.0, + "step": 6951 + }, + { + "epoch": 0.7634526685701735, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.336909055709839, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.697551429271698, + "num_tokens": 179961446.0, + "step": 6952 + }, + { + "epoch": 0.7635624862727872, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2888600826263428, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 
0.7170716524124146, + "num_tokens": 179987906.0, + "step": 6953 + }, + { + "epoch": 0.7636723039754009, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.4353604316711426, + "learning_rate": 1e-06, + "loss": 1.0987, + "mean_token_accuracy": 0.6723202466964722, + "num_tokens": 180011551.0, + "step": 6954 + }, + { + "epoch": 0.7637821216780145, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.272597074508667, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.714370608329773, + "num_tokens": 180037809.0, + "step": 6955 + }, + { + "epoch": 0.7638919393806282, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3145267963409424, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7110010385513306, + "num_tokens": 180063827.0, + "step": 6956 + }, + { + "epoch": 0.7640017570832418, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3788836002349854, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6907951831817627, + "num_tokens": 180089028.0, + "step": 6957 + }, + { + "epoch": 0.7641115747858555, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.499422788619995, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7087600827217102, + "num_tokens": 180113074.0, + "step": 6958 + }, + { + "epoch": 0.7642213924884691, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.400522232055664, + "learning_rate": 1e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6902238130569458, + "num_tokens": 180140288.0, + "step": 6959 + }, + { + "epoch": 0.7643312101910829, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.340557813644409, + "learning_rate": 1e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6912369132041931, + "num_tokens": 180165848.0, + "step": 6960 + }, + { + "epoch": 0.7644410278936965, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.1888391971588135, + "learning_rate": 1e-06, + "loss": 1.104, + "mean_token_accuracy": 0.6817324757575989, + "num_tokens": 
180193944.0, + "step": 6961 + }, + { + "epoch": 0.7645508455963101, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.4126155376434326, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.7022693753242493, + "num_tokens": 180217722.0, + "step": 6962 + }, + { + "epoch": 0.7646606632989238, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2843408584594727, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7212435603141785, + "num_tokens": 180242650.0, + "step": 6963 + }, + { + "epoch": 0.7647704810015374, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.044126510620117, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7322374582290649, + "num_tokens": 180272453.0, + "step": 6964 + }, + { + "epoch": 0.7648802987041511, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.169914484024048, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7148312330245972, + "num_tokens": 180300422.0, + "step": 6965 + }, + { + "epoch": 0.7649901164067647, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.510817289352417, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7265653610229492, + "num_tokens": 180322956.0, + "step": 6966 + }, + { + "epoch": 0.7650999341093784, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.2770192623138428, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7004228830337524, + "num_tokens": 180348589.0, + "step": 6967 + }, + { + "epoch": 0.7652097518119921, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.6913795471191406, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7026857733726501, + "num_tokens": 180369574.0, + "step": 6968 + }, + { + "epoch": 0.7653195695146058, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5279541015625, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.7016092538833618, + "num_tokens": 180391533.0, + "step": 6969 + }, + { 
+ "epoch": 0.7654293872172194, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3460395336151123, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6841422319412231, + "num_tokens": 180419677.0, + "step": 6970 + }, + { + "epoch": 0.7655392049198331, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.255810260772705, + "learning_rate": 1e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6861301064491272, + "num_tokens": 180449792.0, + "step": 6971 + }, + { + "epoch": 0.7656490226224467, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.595989227294922, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7042653560638428, + "num_tokens": 180470960.0, + "step": 6972 + }, + { + "epoch": 0.7657588403250604, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.197147846221924, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6781966686248779, + "num_tokens": 180501704.0, + "step": 6973 + }, + { + "epoch": 0.765868658027674, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.5925779342651367, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7020738124847412, + "num_tokens": 180522239.0, + "step": 6974 + }, + { + "epoch": 0.7659784757302878, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.401134967803955, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.7037162780761719, + "num_tokens": 180545659.0, + "step": 6975 + }, + { + "epoch": 0.7660882934329014, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.2258710861206055, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7231090068817139, + "num_tokens": 180570244.0, + "step": 6976 + }, + { + "epoch": 0.7661981111355151, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 2.3819406032562256, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7188906669616699, + "num_tokens": 180597361.0, + "step": 6977 + }, + { + "epoch": 0.7663079288381287, + 
"ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.355315685272217, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6995857954025269, + "num_tokens": 180622400.0, + "step": 6978 + }, + { + "epoch": 0.7664177465407424, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.2801856994628906, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7067040801048279, + "num_tokens": 180649632.0, + "step": 6979 + }, + { + "epoch": 0.766527564243356, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.169996738433838, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7148841619491577, + "num_tokens": 180678695.0, + "step": 6980 + }, + { + "epoch": 0.7666373819459696, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.675948143005371, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7195471525192261, + "num_tokens": 180700718.0, + "step": 6981 + }, + { + "epoch": 0.7667471996485834, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3882434368133545, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7285052537918091, + "num_tokens": 180723553.0, + "step": 6982 + }, + { + "epoch": 0.766857017351197, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.352795362472534, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7110246419906616, + "num_tokens": 180748214.0, + "step": 6983 + }, + { + "epoch": 0.7669668350538107, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.335627555847168, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7022812366485596, + "num_tokens": 180773817.0, + "step": 6984 + }, + { + "epoch": 0.7670766527564243, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.6801185607910156, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7070218324661255, + "num_tokens": 180794784.0, + "step": 6985 + }, + { + "epoch": 0.767186470459038, + "ewc_loss": 1.3589859008789062e-05, + 
"grad_norm": 2.5946316719055176, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6984944939613342, + "num_tokens": 180815988.0, + "step": 6986 + }, + { + "epoch": 0.7672962881616516, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.231121301651001, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6963067650794983, + "num_tokens": 180842777.0, + "step": 6987 + }, + { + "epoch": 0.7674061058642653, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3479366302490234, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.71058189868927, + "num_tokens": 180868011.0, + "step": 6988 + }, + { + "epoch": 0.767515923566879, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.113142967224121, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.6893689632415771, + "num_tokens": 180899163.0, + "step": 6989 + }, + { + "epoch": 0.7676257412694927, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3192648887634277, + "learning_rate": 1e-06, + "loss": 1.1265, + "mean_token_accuracy": 0.6701518297195435, + "num_tokens": 180926392.0, + "step": 6990 + }, + { + "epoch": 0.7677355589721063, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.327519178390503, + "learning_rate": 1e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6949892640113831, + "num_tokens": 180951585.0, + "step": 6991 + }, + { + "epoch": 0.76784537667472, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.3667349815368652, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7048738598823547, + "num_tokens": 180975363.0, + "step": 6992 + }, + { + "epoch": 0.7679551943773336, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.25534987449646, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6881223320960999, + "num_tokens": 181003116.0, + "step": 6993 + }, + { + "epoch": 0.7680650120799473, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.285898447036743, + 
"learning_rate": 1e-06, + "loss": 1.0811, + "mean_token_accuracy": 0.6890782713890076, + "num_tokens": 181029285.0, + "step": 6994 + }, + { + "epoch": 0.7681748297825609, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.3201088905334473, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7103686332702637, + "num_tokens": 181053653.0, + "step": 6995 + }, + { + "epoch": 0.7682846474851747, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.3513388633728027, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.7047619223594666, + "num_tokens": 181079899.0, + "step": 6996 + }, + { + "epoch": 0.7683944651877883, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.37524676322937, + "learning_rate": 1e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.6920303106307983, + "num_tokens": 181105803.0, + "step": 6997 + }, + { + "epoch": 0.768504282890402, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.3195433616638184, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7038053274154663, + "num_tokens": 181132456.0, + "step": 6998 + }, + { + "epoch": 0.7686141005930156, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.5560972690582275, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6921160221099854, + "num_tokens": 181155100.0, + "step": 6999 + }, + { + "epoch": 0.7687239182956292, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.6123414039611816, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7044139504432678, + "num_tokens": 181176842.0, + "step": 7000 + }, + { + "epoch": 0.7688337359982429, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.4335920810699463, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6983864903450012, + "num_tokens": 181200229.0, + "step": 7001 + }, + { + "epoch": 0.7689435537008565, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.1860175132751465, + "learning_rate": 1e-06, + "loss": 
1.0386, + "mean_token_accuracy": 0.7014120817184448, + "num_tokens": 181229968.0, + "step": 7002 + }, + { + "epoch": 0.7690533714034702, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.41336727142334, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6946675181388855, + "num_tokens": 181256302.0, + "step": 7003 + }, + { + "epoch": 0.7691631891060839, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.200683355331421, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7290582656860352, + "num_tokens": 181286191.0, + "step": 7004 + }, + { + "epoch": 0.7692730068086976, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.6299285888671875, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7015883326530457, + "num_tokens": 181307029.0, + "step": 7005 + }, + { + "epoch": 0.7693828245113112, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.3265795707702637, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7038426995277405, + "num_tokens": 181332716.0, + "step": 7006 + }, + { + "epoch": 0.7694926422139249, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.1817667484283447, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7049055099487305, + "num_tokens": 181361598.0, + "step": 7007 + }, + { + "epoch": 0.7696024599165385, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.439643621444702, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6880512237548828, + "num_tokens": 181385434.0, + "step": 7008 + }, + { + "epoch": 0.7697122776191522, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.3826663494110107, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.694858729839325, + "num_tokens": 181411830.0, + "step": 7009 + }, + { + "epoch": 0.7698220953217658, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.2214531898498535, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 
0.698599100112915, + "num_tokens": 181442415.0, + "step": 7010 + }, + { + "epoch": 0.7699319130243796, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.4144153594970703, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7006515264511108, + "num_tokens": 181466967.0, + "step": 7011 + }, + { + "epoch": 0.7700417307269932, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.659205436706543, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7176003456115723, + "num_tokens": 181487298.0, + "step": 7012 + }, + { + "epoch": 0.7701515484296069, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.590752363204956, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7354069352149963, + "num_tokens": 181507506.0, + "step": 7013 + }, + { + "epoch": 0.7702613661322205, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.2209949493408203, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7017304301261902, + "num_tokens": 181536470.0, + "step": 7014 + }, + { + "epoch": 0.7703711838348342, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.412346124649048, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7040988802909851, + "num_tokens": 181562596.0, + "step": 7015 + }, + { + "epoch": 0.7704810015374478, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.292598009109497, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6869391202926636, + "num_tokens": 181591441.0, + "step": 7016 + }, + { + "epoch": 0.7705908192400615, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3791863918304443, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6966163516044617, + "num_tokens": 181617441.0, + "step": 7017 + }, + { + "epoch": 0.7707006369426752, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.291067600250244, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.7098803520202637, + "num_tokens": 
181644105.0, + "step": 7018 + }, + { + "epoch": 0.7708104546452889, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.297022819519043, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.711086630821228, + "num_tokens": 181669368.0, + "step": 7019 + }, + { + "epoch": 0.7709202723479025, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.613431692123413, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7016699910163879, + "num_tokens": 181690185.0, + "step": 7020 + }, + { + "epoch": 0.7710300900505161, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2475953102111816, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7148216962814331, + "num_tokens": 181717010.0, + "step": 7021 + }, + { + "epoch": 0.7711399077531298, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3648667335510254, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6868183016777039, + "num_tokens": 181742561.0, + "step": 7022 + }, + { + "epoch": 0.7712497254557434, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.423234462738037, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6885627508163452, + "num_tokens": 181766667.0, + "step": 7023 + }, + { + "epoch": 0.7713595431583571, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.3466567993164062, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7285383939743042, + "num_tokens": 181792346.0, + "step": 7024 + }, + { + "epoch": 0.7714693608609708, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.4774181842803955, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6803133487701416, + "num_tokens": 181817478.0, + "step": 7025 + }, + { + "epoch": 0.7715791785635845, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4770073890686035, + "learning_rate": 1e-06, + "loss": 1.0824, + "mean_token_accuracy": 0.6921903491020203, + "num_tokens": 181841823.0, + "step": 7026 + }, + { 
+ "epoch": 0.7716889962661981, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5294575691223145, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.697440505027771, + "num_tokens": 181863086.0, + "step": 7027 + }, + { + "epoch": 0.7717988139688118, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4132091999053955, + "learning_rate": 1e-06, + "loss": 1.0926, + "mean_token_accuracy": 0.681640088558197, + "num_tokens": 181888119.0, + "step": 7028 + }, + { + "epoch": 0.7719086316714254, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2566628456115723, + "learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6901454925537109, + "num_tokens": 181914649.0, + "step": 7029 + }, + { + "epoch": 0.7720184493740391, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3284101486206055, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6962185502052307, + "num_tokens": 181940137.0, + "step": 7030 + }, + { + "epoch": 0.7721282670766527, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3642935752868652, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7241736054420471, + "num_tokens": 181963552.0, + "step": 7031 + }, + { + "epoch": 0.7722380847792664, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5427913665771484, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7342718839645386, + "num_tokens": 181983514.0, + "step": 7032 + }, + { + "epoch": 0.7723479024818801, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2257447242736816, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7061924338340759, + "num_tokens": 182012525.0, + "step": 7033 + }, + { + "epoch": 0.7724577201844938, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.4752955436706543, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6879363059997559, + "num_tokens": 182036340.0, + "step": 7034 + }, + { + "epoch": 0.7725675378871074, + 
"ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.707977056503296, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7129622101783752, + "num_tokens": 182055692.0, + "step": 7035 + }, + { + "epoch": 0.7726773555897211, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1737465858459473, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7066946029663086, + "num_tokens": 182084359.0, + "step": 7036 + }, + { + "epoch": 0.7727871732923347, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5185680389404297, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6868373155593872, + "num_tokens": 182108290.0, + "step": 7037 + }, + { + "epoch": 0.7728969909949484, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.151406764984131, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6871492862701416, + "num_tokens": 182137242.0, + "step": 7038 + }, + { + "epoch": 0.773006808697562, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.278351068496704, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6887325048446655, + "num_tokens": 182166149.0, + "step": 7039 + }, + { + "epoch": 0.7731166264001758, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3885130882263184, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7128044366836548, + "num_tokens": 182189087.0, + "step": 7040 + }, + { + "epoch": 0.7732264441027894, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4519131183624268, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6915705800056458, + "num_tokens": 182213522.0, + "step": 7041 + }, + { + "epoch": 0.773336261805403, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 2.482767343521118, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.684411883354187, + "num_tokens": 182236627.0, + "step": 7042 + }, + { + "epoch": 0.7734460795080167, + "ewc_loss": 1.3649463653564453e-05, + 
"grad_norm": 2.528581142425537, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7262670993804932, + "num_tokens": 182259748.0, + "step": 7043 + }, + { + "epoch": 0.7735558972106303, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3043053150177, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7018012404441833, + "num_tokens": 182288135.0, + "step": 7044 + }, + { + "epoch": 0.773665714913244, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2858402729034424, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6919932961463928, + "num_tokens": 182313404.0, + "step": 7045 + }, + { + "epoch": 0.7737755326158576, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.171290636062622, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7101566195487976, + "num_tokens": 182342822.0, + "step": 7046 + }, + { + "epoch": 0.7738853503184714, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.766026258468628, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7249336838722229, + "num_tokens": 182360758.0, + "step": 7047 + }, + { + "epoch": 0.773995168021085, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.0283167362213135, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6920535564422607, + "num_tokens": 182393280.0, + "step": 7048 + }, + { + "epoch": 0.7741049857236987, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.522833824157715, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.7006618976593018, + "num_tokens": 182417912.0, + "step": 7049 + }, + { + "epoch": 0.7742148034263123, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.406461715698242, + "learning_rate": 1e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6770815849304199, + "num_tokens": 182442234.0, + "step": 7050 + }, + { + "epoch": 0.774324621128926, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3391480445861816, + 
"learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7149516344070435, + "num_tokens": 182466737.0, + "step": 7051 + }, + { + "epoch": 0.7744344388315396, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.240750551223755, + "learning_rate": 1e-06, + "loss": 1.0806, + "mean_token_accuracy": 0.6868157386779785, + "num_tokens": 182494786.0, + "step": 7052 + }, + { + "epoch": 0.7745442565341533, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.392225980758667, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7130272388458252, + "num_tokens": 182517728.0, + "step": 7053 + }, + { + "epoch": 0.774654074236767, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 3.9747679233551025, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7210205793380737, + "num_tokens": 182539508.0, + "step": 7054 + }, + { + "epoch": 0.7747638919393807, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3185908794403076, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6905648112297058, + "num_tokens": 182565504.0, + "step": 7055 + }, + { + "epoch": 0.7748737096419943, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.2507784366607666, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6931254863739014, + "num_tokens": 182593815.0, + "step": 7056 + }, + { + "epoch": 0.774983527344608, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 6.995693683624268, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6990912556648254, + "num_tokens": 182621249.0, + "step": 7057 + }, + { + "epoch": 0.7750933450472216, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.4554147720336914, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7209134101867676, + "num_tokens": 182645287.0, + "step": 7058 + }, + { + "epoch": 0.7752031627498353, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.4011991024017334, + "learning_rate": 1e-06, + "loss": 
1.0316, + "mean_token_accuracy": 0.6942917704582214, + "num_tokens": 182670630.0, + "step": 7059 + }, + { + "epoch": 0.7753129804524489, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.416910171508789, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7255029678344727, + "num_tokens": 182693031.0, + "step": 7060 + }, + { + "epoch": 0.7754227981550625, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.228740692138672, + "learning_rate": 1e-06, + "loss": 1.1105, + "mean_token_accuracy": 0.6760038137435913, + "num_tokens": 182720336.0, + "step": 7061 + }, + { + "epoch": 0.7755326158576763, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.293182373046875, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7022891640663147, + "num_tokens": 182744926.0, + "step": 7062 + }, + { + "epoch": 0.77564243356029, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3264663219451904, + "learning_rate": 1e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.6811025738716125, + "num_tokens": 182771478.0, + "step": 7063 + }, + { + "epoch": 0.7757522512629036, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5239739418029785, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6997907757759094, + "num_tokens": 182795271.0, + "step": 7064 + }, + { + "epoch": 0.7758620689655172, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.333599328994751, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6909786462783813, + "num_tokens": 182820752.0, + "step": 7065 + }, + { + "epoch": 0.7759718866681309, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 3.7463746070861816, + "learning_rate": 1e-06, + "loss": 1.0604, + "mean_token_accuracy": 0.6875169277191162, + "num_tokens": 182850623.0, + "step": 7066 + }, + { + "epoch": 0.7760817043707445, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3512420654296875, + "learning_rate": 1e-06, + "loss": 1.0967, + "mean_token_accuracy": 
0.682695746421814, + "num_tokens": 182876812.0, + "step": 7067 + }, + { + "epoch": 0.7761915220733582, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3524928092956543, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7231414914131165, + "num_tokens": 182901447.0, + "step": 7068 + }, + { + "epoch": 0.7763013397759719, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.735051393508911, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6944375038146973, + "num_tokens": 182922621.0, + "step": 7069 + }, + { + "epoch": 0.7764111574785856, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.384094476699829, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7164878845214844, + "num_tokens": 182945907.0, + "step": 7070 + }, + { + "epoch": 0.7765209751811992, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.513596534729004, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.705707311630249, + "num_tokens": 182967961.0, + "step": 7071 + }, + { + "epoch": 0.7766307928838129, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.399089813232422, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6989439725875854, + "num_tokens": 182997028.0, + "step": 7072 + }, + { + "epoch": 0.7767406105864265, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.410862445831299, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7157050371170044, + "num_tokens": 183020275.0, + "step": 7073 + }, + { + "epoch": 0.7768504282890402, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5441908836364746, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.706005334854126, + "num_tokens": 183042996.0, + "step": 7074 + }, + { + "epoch": 0.7769602459916538, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.4978034496307373, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7023886442184448, + "num_tokens": 
183069469.0, + "step": 7075 + }, + { + "epoch": 0.7770700636942676, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.385754346847534, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6981853246688843, + "num_tokens": 183094259.0, + "step": 7076 + }, + { + "epoch": 0.7771798813968812, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.469120979309082, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.732241153717041, + "num_tokens": 183114980.0, + "step": 7077 + }, + { + "epoch": 0.7772896990994949, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.440732002258301, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7047910094261169, + "num_tokens": 183140267.0, + "step": 7078 + }, + { + "epoch": 0.7773995168021085, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.411980390548706, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7158777713775635, + "num_tokens": 183165152.0, + "step": 7079 + }, + { + "epoch": 0.7775093345047221, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.191784620285034, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7095820903778076, + "num_tokens": 183192657.0, + "step": 7080 + }, + { + "epoch": 0.7776191522073358, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1421117782592773, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7332855463027954, + "num_tokens": 183221786.0, + "step": 7081 + }, + { + "epoch": 0.7777289699099494, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.181205987930298, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7254465818405151, + "num_tokens": 183248432.0, + "step": 7082 + }, + { + "epoch": 0.7778387876125632, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2319538593292236, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7107929587364197, + "num_tokens": 183276389.0, + "step": 7083 + }, + { + 
"epoch": 0.7779486053151768, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.387298345565796, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7099912166595459, + "num_tokens": 183302609.0, + "step": 7084 + }, + { + "epoch": 0.7780584230177905, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1163601875305176, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6930147409439087, + "num_tokens": 183336102.0, + "step": 7085 + }, + { + "epoch": 0.7781682407204041, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1391749382019043, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6949460506439209, + "num_tokens": 183365553.0, + "step": 7086 + }, + { + "epoch": 0.7782780584230178, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.303922653198242, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6938830614089966, + "num_tokens": 183391577.0, + "step": 7087 + }, + { + "epoch": 0.7783878761256314, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.314863681793213, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6925448179244995, + "num_tokens": 183416801.0, + "step": 7088 + }, + { + "epoch": 0.7784976938282451, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 2.7178776264190674, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7332622408866882, + "num_tokens": 183435573.0, + "step": 7089 + }, + { + "epoch": 0.7786075115308587, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.7740793228149414, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6884165406227112, + "num_tokens": 183467300.0, + "step": 7090 + }, + { + "epoch": 0.7787173292334725, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4591493606567383, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7034253478050232, + "num_tokens": 183490195.0, + "step": 7091 + }, + { + "epoch": 0.7788271469360861, + 
"ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.480541467666626, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6992061138153076, + "num_tokens": 183514431.0, + "step": 7092 + }, + { + "epoch": 0.7789369646386998, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3331336975097656, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7392591238021851, + "num_tokens": 183539682.0, + "step": 7093 + }, + { + "epoch": 0.7790467823413134, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5739455223083496, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7148469686508179, + "num_tokens": 183562472.0, + "step": 7094 + }, + { + "epoch": 0.7791566000439271, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.326859474182129, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7035401463508606, + "num_tokens": 183589502.0, + "step": 7095 + }, + { + "epoch": 0.7792664177465407, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.7615132331848145, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6964595317840576, + "num_tokens": 183615986.0, + "step": 7096 + }, + { + "epoch": 0.7793762354491544, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.233750820159912, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.691405713558197, + "num_tokens": 183642905.0, + "step": 7097 + }, + { + "epoch": 0.7794860531517681, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.609184741973877, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7145572304725647, + "num_tokens": 183664664.0, + "step": 7098 + }, + { + "epoch": 0.7795958708543818, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3715298175811768, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7255369424819946, + "num_tokens": 183688895.0, + "step": 7099 + }, + { + "epoch": 0.7797056885569954, + "ewc_loss": 1.3709068298339844e-05, + 
"grad_norm": 2.2981324195861816, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6957932114601135, + "num_tokens": 183716026.0, + "step": 7100 + }, + { + "epoch": 0.779815506259609, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1250524520874023, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7060179710388184, + "num_tokens": 183744988.0, + "step": 7101 + }, + { + "epoch": 0.7799253239622227, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3506364822387695, + "learning_rate": 1e-06, + "loss": 1.1023, + "mean_token_accuracy": 0.6766179800033569, + "num_tokens": 183771237.0, + "step": 7102 + }, + { + "epoch": 0.7800351416648363, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2689380645751953, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6980414390563965, + "num_tokens": 183798604.0, + "step": 7103 + }, + { + "epoch": 0.78014495936745, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5915069580078125, + "learning_rate": 1e-06, + "loss": 1.092, + "mean_token_accuracy": 0.6781436204910278, + "num_tokens": 183824031.0, + "step": 7104 + }, + { + "epoch": 0.7802547770700637, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.493032217025757, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7073254585266113, + "num_tokens": 183847258.0, + "step": 7105 + }, + { + "epoch": 0.7803645947726774, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2996883392333984, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7152489423751831, + "num_tokens": 183872247.0, + "step": 7106 + }, + { + "epoch": 0.780474412475291, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3650424480438232, + "learning_rate": 1e-06, + "loss": 1.1056, + "mean_token_accuracy": 0.6852819919586182, + "num_tokens": 183899771.0, + "step": 7107 + }, + { + "epoch": 0.7805842301779047, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2281618118286133, + 
"learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7016698122024536, + "num_tokens": 183927184.0, + "step": 7108 + }, + { + "epoch": 0.7806940478805183, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.0970816612243652, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7028419375419617, + "num_tokens": 183957129.0, + "step": 7109 + }, + { + "epoch": 0.780803865583132, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1456246376037598, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.7021613121032715, + "num_tokens": 183985840.0, + "step": 7110 + }, + { + "epoch": 0.7809136832857456, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.292158603668213, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6906818151473999, + "num_tokens": 184012366.0, + "step": 7111 + }, + { + "epoch": 0.7810235009883594, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.249016284942627, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6865242719650269, + "num_tokens": 184040951.0, + "step": 7112 + }, + { + "epoch": 0.781133318690973, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3681302070617676, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7101967930793762, + "num_tokens": 184067588.0, + "step": 7113 + }, + { + "epoch": 0.7812431363935867, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.547257661819458, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7316044569015503, + "num_tokens": 184089857.0, + "step": 7114 + }, + { + "epoch": 0.7813529540962003, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.459000825881958, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7192953824996948, + "num_tokens": 184113498.0, + "step": 7115 + }, + { + "epoch": 0.781462771798814, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.437479257583618, + "learning_rate": 1e-06, + "loss": 1.0662, 
+ "mean_token_accuracy": 0.6898089647293091, + "num_tokens": 184137404.0, + "step": 7116 + }, + { + "epoch": 0.7815725895014276, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.664350986480713, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.698890209197998, + "num_tokens": 184159063.0, + "step": 7117 + }, + { + "epoch": 0.7816824072040413, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4156572818756104, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7111048698425293, + "num_tokens": 184181374.0, + "step": 7118 + }, + { + "epoch": 0.7817922249066549, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.480105400085449, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7158242464065552, + "num_tokens": 184204543.0, + "step": 7119 + }, + { + "epoch": 0.7819020426092687, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.478327751159668, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.710374116897583, + "num_tokens": 184227436.0, + "step": 7120 + }, + { + "epoch": 0.7820118603118823, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.434431791305542, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7143872976303101, + "num_tokens": 184252789.0, + "step": 7121 + }, + { + "epoch": 0.782121678014496, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.224766731262207, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7129796147346497, + "num_tokens": 184279574.0, + "step": 7122 + }, + { + "epoch": 0.7822314957171096, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.274127721786499, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6956443786621094, + "num_tokens": 184311071.0, + "step": 7123 + }, + { + "epoch": 0.7823413134197232, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2240419387817383, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 
0.6834922432899475, + "num_tokens": 184339486.0, + "step": 7124 + }, + { + "epoch": 0.7824511311223369, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.475177049636841, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7222459316253662, + "num_tokens": 184362691.0, + "step": 7125 + }, + { + "epoch": 0.7825609488249505, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1204192638397217, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7009791731834412, + "num_tokens": 184391019.0, + "step": 7126 + }, + { + "epoch": 0.7826707665275643, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4547297954559326, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7183204293251038, + "num_tokens": 184412939.0, + "step": 7127 + }, + { + "epoch": 0.7827805842301779, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.202230930328369, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7118722796440125, + "num_tokens": 184441180.0, + "step": 7128 + }, + { + "epoch": 0.7828904019327916, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.114915132522583, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7265182733535767, + "num_tokens": 184471774.0, + "step": 7129 + }, + { + "epoch": 0.7830002196354052, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.530054807662964, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7077051401138306, + "num_tokens": 184494838.0, + "step": 7130 + }, + { + "epoch": 0.7831100373380189, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2806315422058105, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7197431325912476, + "num_tokens": 184520574.0, + "step": 7131 + }, + { + "epoch": 0.7832198550406325, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.580599546432495, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7094660401344299, + "num_tokens": 
184542020.0, + "step": 7132 + }, + { + "epoch": 0.7833296727432462, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.399264097213745, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6846038103103638, + "num_tokens": 184567106.0, + "step": 7133 + }, + { + "epoch": 0.7834394904458599, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.254845380783081, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7119172811508179, + "num_tokens": 184592293.0, + "step": 7134 + }, + { + "epoch": 0.7835493081484736, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.509911298751831, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6782640814781189, + "num_tokens": 184614675.0, + "step": 7135 + }, + { + "epoch": 0.7836591258510872, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.237921953201294, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6892298460006714, + "num_tokens": 184643311.0, + "step": 7136 + }, + { + "epoch": 0.7837689435537009, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2339344024658203, + "learning_rate": 1e-06, + "loss": 1.0905, + "mean_token_accuracy": 0.6849364042282104, + "num_tokens": 184671028.0, + "step": 7137 + }, + { + "epoch": 0.7838787612563145, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2131412029266357, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7003872394561768, + "num_tokens": 184697887.0, + "step": 7138 + }, + { + "epoch": 0.7839885789589282, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1965863704681396, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7086437940597534, + "num_tokens": 184728433.0, + "step": 7139 + }, + { + "epoch": 0.7840983966615418, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.445537567138672, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7064675092697144, + "num_tokens": 184754313.0, + "step": 7140 + }, + { 
+ "epoch": 0.7842082143641556, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.134678840637207, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6947428584098816, + "num_tokens": 184785843.0, + "step": 7141 + }, + { + "epoch": 0.7843180320667692, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2951407432556152, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7152801752090454, + "num_tokens": 184811996.0, + "step": 7142 + }, + { + "epoch": 0.7844278497693828, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5372908115386963, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6892151236534119, + "num_tokens": 184834675.0, + "step": 7143 + }, + { + "epoch": 0.7845376674719965, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1440927982330322, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7071921825408936, + "num_tokens": 184863266.0, + "step": 7144 + }, + { + "epoch": 0.7846474851746101, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4660844802856445, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.721197247505188, + "num_tokens": 184886024.0, + "step": 7145 + }, + { + "epoch": 0.7847573028772238, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.538973569869995, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.696243405342102, + "num_tokens": 184909965.0, + "step": 7146 + }, + { + "epoch": 0.7848671205798374, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5393617153167725, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7176443934440613, + "num_tokens": 184931186.0, + "step": 7147 + }, + { + "epoch": 0.7849769382824512, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.340979814529419, + "learning_rate": 1e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6872562170028687, + "num_tokens": 184959886.0, + "step": 7148 + }, + { + "epoch": 0.7850867559850648, + 
"ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.310131072998047, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7302518486976624, + "num_tokens": 184983109.0, + "step": 7149 + }, + { + "epoch": 0.7851965736876785, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3821990489959717, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6841680407524109, + "num_tokens": 185012198.0, + "step": 7150 + }, + { + "epoch": 0.7853063913902921, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4299254417419434, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7364633679389954, + "num_tokens": 185033034.0, + "step": 7151 + }, + { + "epoch": 0.7854162090929058, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2659616470336914, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6957589387893677, + "num_tokens": 185059299.0, + "step": 7152 + }, + { + "epoch": 0.7855260267955194, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2194905281066895, + "learning_rate": 1e-06, + "loss": 1.1242, + "mean_token_accuracy": 0.6628208160400391, + "num_tokens": 185087271.0, + "step": 7153 + }, + { + "epoch": 0.7856358444981331, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3857171535491943, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6976976990699768, + "num_tokens": 185113305.0, + "step": 7154 + }, + { + "epoch": 0.7857456622007467, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.6078150272369385, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7130486369132996, + "num_tokens": 185134830.0, + "step": 7155 + }, + { + "epoch": 0.7858554799033605, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.565760850906372, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7226216793060303, + "num_tokens": 185154646.0, + "step": 7156 + }, + { + "epoch": 0.7859652976059741, + "ewc_loss": 1.3709068298339844e-05, 
+ "grad_norm": 2.2704057693481445, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6945022344589233, + "num_tokens": 185183577.0, + "step": 7157 + }, + { + "epoch": 0.7860751153085878, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.9078311920166016, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7177928686141968, + "num_tokens": 185203013.0, + "step": 7158 + }, + { + "epoch": 0.7861849330112014, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2351295948028564, + "learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6844264268875122, + "num_tokens": 185229869.0, + "step": 7159 + }, + { + "epoch": 0.786294750713815, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3837718963623047, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6956189274787903, + "num_tokens": 185255496.0, + "step": 7160 + }, + { + "epoch": 0.7864045684164287, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1642820835113525, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6916369199752808, + "num_tokens": 185284771.0, + "step": 7161 + }, + { + "epoch": 0.7865143861190423, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5192253589630127, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7222774028778076, + "num_tokens": 185306814.0, + "step": 7162 + }, + { + "epoch": 0.7866242038216561, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4410178661346436, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6871722936630249, + "num_tokens": 185331860.0, + "step": 7163 + }, + { + "epoch": 0.7867340215242697, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2289819717407227, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6997359991073608, + "num_tokens": 185359863.0, + "step": 7164 + }, + { + "epoch": 0.7868438392268834, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 
2.275050163269043, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.695732057094574, + "num_tokens": 185387084.0, + "step": 7165 + }, + { + "epoch": 0.786953656929497, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3223352432250977, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7244747877120972, + "num_tokens": 185413133.0, + "step": 7166 + }, + { + "epoch": 0.7870634746321107, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3258934020996094, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6911168098449707, + "num_tokens": 185437925.0, + "step": 7167 + }, + { + "epoch": 0.7871732923347243, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.267310857772827, + "learning_rate": 1e-06, + "loss": 1.0931, + "mean_token_accuracy": 0.6798920631408691, + "num_tokens": 185466086.0, + "step": 7168 + }, + { + "epoch": 0.787283110037338, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2420530319213867, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7093408107757568, + "num_tokens": 185493799.0, + "step": 7169 + }, + { + "epoch": 0.7873929277399517, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.198737621307373, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6988539695739746, + "num_tokens": 185522535.0, + "step": 7170 + }, + { + "epoch": 0.7875027454425654, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.134011745452881, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6813311576843262, + "num_tokens": 185556122.0, + "step": 7171 + }, + { + "epoch": 0.787612563145179, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3462984561920166, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.715266764163971, + "num_tokens": 185582159.0, + "step": 7172 + }, + { + "epoch": 0.7877223808477927, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5354275703430176, + "learning_rate": 
1e-06, + "loss": 1.0836, + "mean_token_accuracy": 0.6774687767028809, + "num_tokens": 185607770.0, + "step": 7173 + }, + { + "epoch": 0.7878321985504063, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.328376531600952, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7147327661514282, + "num_tokens": 185633153.0, + "step": 7174 + }, + { + "epoch": 0.78794201625302, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2312560081481934, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7104520797729492, + "num_tokens": 185662012.0, + "step": 7175 + }, + { + "epoch": 0.7880518339556336, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1410937309265137, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6858042478561401, + "num_tokens": 185691755.0, + "step": 7176 + }, + { + "epoch": 0.7881616516582474, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.373807668685913, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7011723518371582, + "num_tokens": 185715651.0, + "step": 7177 + }, + { + "epoch": 0.788271469360861, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.336665391921997, + "learning_rate": 1e-06, + "loss": 1.1036, + "mean_token_accuracy": 0.6759241819381714, + "num_tokens": 185741311.0, + "step": 7178 + }, + { + "epoch": 0.7883812870634747, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2637135982513428, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7205657362937927, + "num_tokens": 185768557.0, + "step": 7179 + }, + { + "epoch": 0.7884911047660883, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.340874195098877, + "learning_rate": 1e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6885517239570618, + "num_tokens": 185795015.0, + "step": 7180 + }, + { + "epoch": 0.788600922468702, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.361914873123169, + "learning_rate": 1e-06, + "loss": 1.0568, + 
"mean_token_accuracy": 0.6904143691062927, + "num_tokens": 185819116.0, + "step": 7181 + }, + { + "epoch": 0.7887107401713156, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.5088725090026855, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7068368196487427, + "num_tokens": 185841426.0, + "step": 7182 + }, + { + "epoch": 0.7888205578739292, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.383513927459717, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.7006433010101318, + "num_tokens": 185866540.0, + "step": 7183 + }, + { + "epoch": 0.7889303755765429, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.329808473587036, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6999297738075256, + "num_tokens": 185892827.0, + "step": 7184 + }, + { + "epoch": 0.7890401932791566, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.611149311065674, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.726608395576477, + "num_tokens": 185913702.0, + "step": 7185 + }, + { + "epoch": 0.7891500109817703, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.317065715789795, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7122766375541687, + "num_tokens": 185939475.0, + "step": 7186 + }, + { + "epoch": 0.7892598286843839, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.086798667907715, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.6855383515357971, + "num_tokens": 185972425.0, + "step": 7187 + }, + { + "epoch": 0.7893696463869976, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.073239803314209, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.694807767868042, + "num_tokens": 186007050.0, + "step": 7188 + }, + { + "epoch": 0.7894794640896112, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4424233436584473, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 
0.7133631706237793, + "num_tokens": 186030479.0, + "step": 7189 + }, + { + "epoch": 0.7895892817922249, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1174733638763428, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.699090301990509, + "num_tokens": 186061037.0, + "step": 7190 + }, + { + "epoch": 0.7896990994948385, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3490920066833496, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7038685083389282, + "num_tokens": 186086817.0, + "step": 7191 + }, + { + "epoch": 0.7898089171974523, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.225896120071411, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7091919779777527, + "num_tokens": 186114481.0, + "step": 7192 + }, + { + "epoch": 0.7899187349000659, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2544991970062256, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6874271035194397, + "num_tokens": 186142585.0, + "step": 7193 + }, + { + "epoch": 0.7900285526026796, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3730251789093018, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.7004503011703491, + "num_tokens": 186170466.0, + "step": 7194 + }, + { + "epoch": 0.7901383703052932, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.291177749633789, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.704716682434082, + "num_tokens": 186197787.0, + "step": 7195 + }, + { + "epoch": 0.7902481880079069, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.49874210357666, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7151634097099304, + "num_tokens": 186220283.0, + "step": 7196 + }, + { + "epoch": 0.7903580057105205, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1375083923339844, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7004952430725098, + "num_tokens": 
186250044.0, + "step": 7197 + }, + { + "epoch": 0.7904678234131342, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.189380645751953, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6867698431015015, + "num_tokens": 186279918.0, + "step": 7198 + }, + { + "epoch": 0.7905776411157479, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.384723424911499, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6945076584815979, + "num_tokens": 186306253.0, + "step": 7199 + }, + { + "epoch": 0.7906874588183616, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1909472942352295, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7076669931411743, + "num_tokens": 186334988.0, + "step": 7200 + }, + { + "epoch": 0.7907972765209752, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.32454252243042, + "learning_rate": 1e-06, + "loss": 1.1028, + "mean_token_accuracy": 0.675200343132019, + "num_tokens": 186362393.0, + "step": 7201 + }, + { + "epoch": 0.7909070942235888, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.111908435821533, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.724714994430542, + "num_tokens": 186391206.0, + "step": 7202 + }, + { + "epoch": 0.7910169119262025, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4414072036743164, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7030921578407288, + "num_tokens": 186420976.0, + "step": 7203 + }, + { + "epoch": 0.7911267296288161, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4015159606933594, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.7089754939079285, + "num_tokens": 186447364.0, + "step": 7204 + }, + { + "epoch": 0.7912365473314298, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.6660494804382324, + "learning_rate": 1e-06, + "loss": 1.1082, + "mean_token_accuracy": 0.6921595335006714, + "num_tokens": 186470996.0, + "step": 7205 + }, + { + 
"epoch": 0.7913463650340435, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.261939525604248, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7256917357444763, + "num_tokens": 186496877.0, + "step": 7206 + }, + { + "epoch": 0.7914561827366572, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4879536628723145, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7172626852989197, + "num_tokens": 186519265.0, + "step": 7207 + }, + { + "epoch": 0.7915660004392708, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2176058292388916, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6991138458251953, + "num_tokens": 186548132.0, + "step": 7208 + }, + { + "epoch": 0.7916758181418845, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.232560396194458, + "learning_rate": 1e-06, + "loss": 1.1557, + "mean_token_accuracy": 0.6631619930267334, + "num_tokens": 186577678.0, + "step": 7209 + }, + { + "epoch": 0.7917856358444981, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.069565773010254, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.707949697971344, + "num_tokens": 186608047.0, + "step": 7210 + }, + { + "epoch": 0.7918954535471118, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.9998618364334106, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6975467205047607, + "num_tokens": 186642725.0, + "step": 7211 + }, + { + "epoch": 0.7920052712497254, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.0681843757629395, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7041683197021484, + "num_tokens": 186673867.0, + "step": 7212 + }, + { + "epoch": 0.7921150889523391, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.117095708847046, + "learning_rate": 1e-06, + "loss": 1.0677, + "mean_token_accuracy": 0.6891582012176514, + "num_tokens": 186704477.0, + "step": 7213 + }, + { + "epoch": 0.7922249066549528, + 
"ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4179553985595703, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6948275566101074, + "num_tokens": 186730133.0, + "step": 7214 + }, + { + "epoch": 0.7923347243575665, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.8291449546813965, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.7069897651672363, + "num_tokens": 186750994.0, + "step": 7215 + }, + { + "epoch": 0.7924445420601801, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.73274827003479, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.711126446723938, + "num_tokens": 186770534.0, + "step": 7216 + }, + { + "epoch": 0.7925543597627938, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3094890117645264, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6924702525138855, + "num_tokens": 186798520.0, + "step": 7217 + }, + { + "epoch": 0.7926641774654074, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.167163610458374, + "learning_rate": 1e-06, + "loss": 1.1244, + "mean_token_accuracy": 0.6712667942047119, + "num_tokens": 186828948.0, + "step": 7218 + }, + { + "epoch": 0.792773995168021, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.390346050262451, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6973044276237488, + "num_tokens": 186857082.0, + "step": 7219 + }, + { + "epoch": 0.7928838128706347, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.616236448287964, + "learning_rate": 1e-06, + "loss": 1.0672, + "mean_token_accuracy": 0.6877064108848572, + "num_tokens": 186880348.0, + "step": 7220 + }, + { + "epoch": 0.7929936305732485, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.754686117172241, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7041454911231995, + "num_tokens": 186899871.0, + "step": 7221 + }, + { + "epoch": 0.7931034482758621, + "ewc_loss": 1.3709068298339844e-05, + 
"grad_norm": 2.2151236534118652, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.689687967300415, + "num_tokens": 186930246.0, + "step": 7222 + }, + { + "epoch": 0.7932132659784757, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2651755809783936, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7102102041244507, + "num_tokens": 186957223.0, + "step": 7223 + }, + { + "epoch": 0.7933230836810894, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.528193473815918, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7051312923431396, + "num_tokens": 186980450.0, + "step": 7224 + }, + { + "epoch": 0.793432901383703, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.406123161315918, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7096655964851379, + "num_tokens": 187004492.0, + "step": 7225 + }, + { + "epoch": 0.7935427190863167, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.330400228500366, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.712981641292572, + "num_tokens": 187030339.0, + "step": 7226 + }, + { + "epoch": 0.7936525367889303, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4933383464813232, + "learning_rate": 1e-06, + "loss": 1.0799, + "mean_token_accuracy": 0.6871938109397888, + "num_tokens": 187057485.0, + "step": 7227 + }, + { + "epoch": 0.7937623544915441, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.6189050674438477, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7059066295623779, + "num_tokens": 187077206.0, + "step": 7228 + }, + { + "epoch": 0.7938721721941577, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2767810821533203, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7095161080360413, + "num_tokens": 187104941.0, + "step": 7229 + }, + { + "epoch": 0.7939819898967714, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.625901460647583, + 
"learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7212498188018799, + "num_tokens": 187127950.0, + "step": 7230 + }, + { + "epoch": 0.794091807599385, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.384277820587158, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6883581876754761, + "num_tokens": 187153998.0, + "step": 7231 + }, + { + "epoch": 0.7942016253019987, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.466576099395752, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7222809791564941, + "num_tokens": 187174998.0, + "step": 7232 + }, + { + "epoch": 0.7943114430046123, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.296964406967163, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7072082161903381, + "num_tokens": 187201294.0, + "step": 7233 + }, + { + "epoch": 0.794421260707226, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2264950275421143, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6959766745567322, + "num_tokens": 187229083.0, + "step": 7234 + }, + { + "epoch": 0.7945310784098397, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.4504928588867188, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7102810740470886, + "num_tokens": 187250489.0, + "step": 7235 + }, + { + "epoch": 0.7946408961124534, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2594902515411377, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7164838314056396, + "num_tokens": 187277302.0, + "step": 7236 + }, + { + "epoch": 0.794750713815067, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.464784860610962, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7089122533798218, + "num_tokens": 187300062.0, + "step": 7237 + }, + { + "epoch": 0.7948605315176807, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.7409772872924805, + "learning_rate": 1e-06, + "loss": 
0.9271, + "mean_token_accuracy": 0.7168303728103638, + "num_tokens": 187319118.0, + "step": 7238 + }, + { + "epoch": 0.7949703492202943, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.399749279022217, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7040913105010986, + "num_tokens": 187345374.0, + "step": 7239 + }, + { + "epoch": 0.795080166922908, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.295473575592041, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7260780930519104, + "num_tokens": 187370379.0, + "step": 7240 + }, + { + "epoch": 0.7951899846255216, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.3156416416168213, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7184922099113464, + "num_tokens": 187395194.0, + "step": 7241 + }, + { + "epoch": 0.7952998023281352, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.455364465713501, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.70095294713974, + "num_tokens": 187418656.0, + "step": 7242 + }, + { + "epoch": 0.795409620030749, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.39304780960083, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.698310911655426, + "num_tokens": 187442710.0, + "step": 7243 + }, + { + "epoch": 0.7955194377333626, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.2378902435302734, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7092465162277222, + "num_tokens": 187470929.0, + "step": 7244 + }, + { + "epoch": 0.7956292554359763, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.1304478645324707, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7152354717254639, + "num_tokens": 187499503.0, + "step": 7245 + }, + { + "epoch": 0.7957390731385899, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.403374195098877, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 
0.720609724521637, + "num_tokens": 187523173.0, + "step": 7246 + }, + { + "epoch": 0.7958488908412036, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.16161847114563, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7067830562591553, + "num_tokens": 187551392.0, + "step": 7247 + }, + { + "epoch": 0.7959587085438172, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.0492091178894043, + "learning_rate": 1e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.6843787431716919, + "num_tokens": 187583865.0, + "step": 7248 + }, + { + "epoch": 0.7960685262464309, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.535284996032715, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7102645039558411, + "num_tokens": 187605803.0, + "step": 7249 + }, + { + "epoch": 0.7961783439490446, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.500504732131958, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.7002819180488586, + "num_tokens": 187628523.0, + "step": 7250 + }, + { + "epoch": 0.7962881616516583, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.6581995487213135, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7008628845214844, + "num_tokens": 187648844.0, + "step": 7251 + }, + { + "epoch": 0.7963979793542719, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.8189704418182373, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6957396864891052, + "num_tokens": 187668170.0, + "step": 7252 + }, + { + "epoch": 0.7965077970568856, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.273898124694824, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7052620053291321, + "num_tokens": 187695231.0, + "step": 7253 + }, + { + "epoch": 0.7966176147594992, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.60567307472229, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7472978830337524, + "num_tokens": 
187716433.0, + "step": 7254 + }, + { + "epoch": 0.7967274324621129, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.5471768379211426, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7368134260177612, + "num_tokens": 187736921.0, + "step": 7255 + }, + { + "epoch": 0.7968372501647265, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.300567150115967, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6953587532043457, + "num_tokens": 187761659.0, + "step": 7256 + }, + { + "epoch": 0.7969470678673403, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.171696186065674, + "learning_rate": 1e-06, + "loss": 1.1106, + "mean_token_accuracy": 0.6742806434631348, + "num_tokens": 187794860.0, + "step": 7257 + }, + { + "epoch": 0.7970568855699539, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.269801378250122, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.7009201049804688, + "num_tokens": 187822637.0, + "step": 7258 + }, + { + "epoch": 0.7971667032725676, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.129275321960449, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.7006694674491882, + "num_tokens": 187853797.0, + "step": 7259 + }, + { + "epoch": 0.7972765209751812, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.5828895568847656, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7169480323791504, + "num_tokens": 187874537.0, + "step": 7260 + }, + { + "epoch": 0.7973863386777948, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.0963616371154785, + "learning_rate": 1e-06, + "loss": 1.1234, + "mean_token_accuracy": 0.6694895625114441, + "num_tokens": 187907296.0, + "step": 7261 + }, + { + "epoch": 0.7974961563804085, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.31667423248291, + "learning_rate": 1e-06, + "loss": 1.1545, + "mean_token_accuracy": 0.6625500917434692, + "num_tokens": 187935828.0, + "step": 7262 + }, + { 
+ "epoch": 0.7976059740830221, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.4183409214019775, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7235181331634521, + "num_tokens": 187958585.0, + "step": 7263 + }, + { + "epoch": 0.7977157917856359, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.5535173416137695, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7161314487457275, + "num_tokens": 187979118.0, + "step": 7264 + }, + { + "epoch": 0.7978256094882495, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.455961227416992, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7039775848388672, + "num_tokens": 188005438.0, + "step": 7265 + }, + { + "epoch": 0.7979354271908632, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.5281903743743896, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7059151530265808, + "num_tokens": 188029090.0, + "step": 7266 + }, + { + "epoch": 0.7980452448934768, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.3691887855529785, + "learning_rate": 1e-06, + "loss": 1.1033, + "mean_token_accuracy": 0.6778416037559509, + "num_tokens": 188053893.0, + "step": 7267 + }, + { + "epoch": 0.7981550625960905, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.4560248851776123, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6954477429389954, + "num_tokens": 188078874.0, + "step": 7268 + }, + { + "epoch": 0.7982648802987041, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.509803533554077, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6982523202896118, + "num_tokens": 188105436.0, + "step": 7269 + }, + { + "epoch": 0.7983746980013178, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.208340883255005, + "learning_rate": 1e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.6858274936676025, + "num_tokens": 188139579.0, + "step": 7270 + }, + { + "epoch": 0.7984845157039314, + 
"ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.3165409564971924, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7158365249633789, + "num_tokens": 188165248.0, + "step": 7271 + }, + { + "epoch": 0.7985943334065452, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.215529680252075, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6916900277137756, + "num_tokens": 188192813.0, + "step": 7272 + }, + { + "epoch": 0.7987041511091588, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.721696138381958, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7130817174911499, + "num_tokens": 188212555.0, + "step": 7273 + }, + { + "epoch": 0.7988139688117725, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.293170690536499, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7005037069320679, + "num_tokens": 188240737.0, + "step": 7274 + }, + { + "epoch": 0.7989237865143861, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.143537998199463, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7034242749214172, + "num_tokens": 188268722.0, + "step": 7275 + }, + { + "epoch": 0.7990336042169998, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.464508533477783, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6824201345443726, + "num_tokens": 188295061.0, + "step": 7276 + }, + { + "epoch": 0.7991434219196134, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.321645975112915, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.715835690498352, + "num_tokens": 188322008.0, + "step": 7277 + }, + { + "epoch": 0.799253239622227, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.239401340484619, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6864930391311646, + "num_tokens": 188351369.0, + "step": 7278 + }, + { + "epoch": 0.7993630573248408, + "ewc_loss": 1.3768672943115234e-05, + 
"grad_norm": 2.4194722175598145, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7202212810516357, + "num_tokens": 188374342.0, + "step": 7279 + }, + { + "epoch": 0.7994728750274545, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.2291829586029053, + "learning_rate": 1e-06, + "loss": 1.1128, + "mean_token_accuracy": 0.6799410581588745, + "num_tokens": 188403080.0, + "step": 7280 + }, + { + "epoch": 0.7995826927300681, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.1418120861053467, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6855565309524536, + "num_tokens": 188434808.0, + "step": 7281 + }, + { + "epoch": 0.7996925104326817, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.168872594833374, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.7111184597015381, + "num_tokens": 188465375.0, + "step": 7282 + }, + { + "epoch": 0.7998023281352954, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.0793004035949707, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7057371735572815, + "num_tokens": 188495748.0, + "step": 7283 + }, + { + "epoch": 0.799912145837909, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 2.6280388832092285, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7040642499923706, + "num_tokens": 188517345.0, + "step": 7284 + }, + { + "epoch": 0.8000219635405227, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.3160760402679443, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7063175439834595, + "num_tokens": 188543347.0, + "step": 7285 + }, + { + "epoch": 0.8001317812431364, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.0110270977020264, + "learning_rate": 1e-06, + "loss": 1.1272, + "mean_token_accuracy": 0.6772791147232056, + "num_tokens": 188577493.0, + "step": 7286 + }, + { + "epoch": 0.8002415989457501, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.2385098934173584, 
+ "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7105945348739624, + "num_tokens": 188602905.0, + "step": 7287 + }, + { + "epoch": 0.8003514166483637, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.3186771869659424, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.706386148929596, + "num_tokens": 188629470.0, + "step": 7288 + }, + { + "epoch": 0.8004612343509774, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.2291805744171143, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6931589841842651, + "num_tokens": 188657305.0, + "step": 7289 + }, + { + "epoch": 0.800571052053591, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.277127265930176, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6798670291900635, + "num_tokens": 188685231.0, + "step": 7290 + }, + { + "epoch": 0.8006808697562047, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2160112857818604, + "learning_rate": 1e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.6830387115478516, + "num_tokens": 188719881.0, + "step": 7291 + }, + { + "epoch": 0.8007906874588183, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.5329084396362305, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7311639189720154, + "num_tokens": 188740271.0, + "step": 7292 + }, + { + "epoch": 0.8009005051614321, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.0877718925476074, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7160096764564514, + "num_tokens": 188771459.0, + "step": 7293 + }, + { + "epoch": 0.8010103228640457, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.495471954345703, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7178316116333008, + "num_tokens": 188796163.0, + "step": 7294 + }, + { + "epoch": 0.8011201405666594, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.590677499771118, + "learning_rate": 1e-06, + "loss": 
0.9665, + "mean_token_accuracy": 0.7074726819992065, + "num_tokens": 188818117.0, + "step": 7295 + }, + { + "epoch": 0.801229958269273, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.4929745197296143, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7067809104919434, + "num_tokens": 188839835.0, + "step": 7296 + }, + { + "epoch": 0.8013397759718867, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.354236364364624, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7038788795471191, + "num_tokens": 188864548.0, + "step": 7297 + }, + { + "epoch": 0.8014495936745003, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.2081010341644287, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6944745779037476, + "num_tokens": 188894610.0, + "step": 7298 + }, + { + "epoch": 0.801559411377114, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.6846961975097656, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7199313640594482, + "num_tokens": 188915161.0, + "step": 7299 + }, + { + "epoch": 0.8016692290797276, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.37825608253479, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6867629289627075, + "num_tokens": 188944775.0, + "step": 7300 + }, + { + "epoch": 0.8017790467823414, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2725727558135986, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7080659866333008, + "num_tokens": 188973964.0, + "step": 7301 + }, + { + "epoch": 0.801888864484955, + "ewc_loss": 1.3768672943115234e-05, + "grad_norm": 2.358351945877075, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7096173763275146, + "num_tokens": 189000178.0, + "step": 7302 + }, + { + "epoch": 0.8019986821875686, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.30128812789917, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 
0.7002277374267578, + "num_tokens": 189030193.0, + "step": 7303 + }, + { + "epoch": 0.8021084998901823, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.627622604370117, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6856766939163208, + "num_tokens": 189053274.0, + "step": 7304 + }, + { + "epoch": 0.8022183175927959, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2052366733551025, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6883993148803711, + "num_tokens": 189082363.0, + "step": 7305 + }, + { + "epoch": 0.8023281352954096, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.260446548461914, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.694195032119751, + "num_tokens": 189109955.0, + "step": 7306 + }, + { + "epoch": 0.8024379529980232, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.198131561279297, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7212949991226196, + "num_tokens": 189134395.0, + "step": 7307 + }, + { + "epoch": 0.802547770700637, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.278747320175171, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7013170123100281, + "num_tokens": 189162058.0, + "step": 7308 + }, + { + "epoch": 0.8026575884032506, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.292335033416748, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6955451965332031, + "num_tokens": 189187227.0, + "step": 7309 + }, + { + "epoch": 0.8027674061058643, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.293881416320801, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7143151164054871, + "num_tokens": 189212907.0, + "step": 7310 + }, + { + "epoch": 0.8028772238084779, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2163424491882324, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7084248661994934, + "num_tokens": 
189241018.0, + "step": 7311 + }, + { + "epoch": 0.8029870415110916, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.496676445007324, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7021632194519043, + "num_tokens": 189263735.0, + "step": 7312 + }, + { + "epoch": 0.8030968592137052, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2641730308532715, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7228066921234131, + "num_tokens": 189293717.0, + "step": 7313 + }, + { + "epoch": 0.8032066769163189, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.477414131164551, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6982675790786743, + "num_tokens": 189316183.0, + "step": 7314 + }, + { + "epoch": 0.8033164946189326, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.27689528465271, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7027814388275146, + "num_tokens": 189343617.0, + "step": 7315 + }, + { + "epoch": 0.8034263123215463, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.3372483253479004, + "learning_rate": 1e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6950956583023071, + "num_tokens": 189371355.0, + "step": 7316 + }, + { + "epoch": 0.8035361300241599, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.89465069770813, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7026554942131042, + "num_tokens": 189391749.0, + "step": 7317 + }, + { + "epoch": 0.8036459477267736, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.231466770172119, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.689946174621582, + "num_tokens": 189421161.0, + "step": 7318 + }, + { + "epoch": 0.8037557654293872, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.7660131454467773, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7184326648712158, + "num_tokens": 189440703.0, + "step": 7319 + }, + { + 
"epoch": 0.8038655831320008, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.650843381881714, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7083552479743958, + "num_tokens": 189460939.0, + "step": 7320 + }, + { + "epoch": 0.8039754008346145, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.224965810775757, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6929513216018677, + "num_tokens": 189492736.0, + "step": 7321 + }, + { + "epoch": 0.8040852185372283, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.568178176879883, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6989495158195496, + "num_tokens": 189517571.0, + "step": 7322 + }, + { + "epoch": 0.8041950362398419, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.32493257522583, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7107496857643127, + "num_tokens": 189544763.0, + "step": 7323 + }, + { + "epoch": 0.8043048539424555, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.283663749694824, + "learning_rate": 1e-06, + "loss": 1.1188, + "mean_token_accuracy": 0.6814024448394775, + "num_tokens": 189571734.0, + "step": 7324 + }, + { + "epoch": 0.8044146716450692, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2038052082061768, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7140049934387207, + "num_tokens": 189599544.0, + "step": 7325 + }, + { + "epoch": 0.8045244893476828, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.601003646850586, + "learning_rate": 1e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6924921274185181, + "num_tokens": 189620801.0, + "step": 7326 + }, + { + "epoch": 0.8046343070502965, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.4806196689605713, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6927925944328308, + "num_tokens": 189642798.0, + "step": 7327 + }, + { + "epoch": 0.8047441247529101, + 
"ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.6089444160461426, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7119775414466858, + "num_tokens": 189667045.0, + "step": 7328 + }, + { + "epoch": 0.8048539424555239, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.4112789630889893, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7125436067581177, + "num_tokens": 189693170.0, + "step": 7329 + }, + { + "epoch": 0.8049637601581375, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.576395034790039, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6925795078277588, + "num_tokens": 189717385.0, + "step": 7330 + }, + { + "epoch": 0.8050735778607512, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.4221222400665283, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.703679084777832, + "num_tokens": 189743773.0, + "step": 7331 + }, + { + "epoch": 0.8051833955633648, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2624919414520264, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6818636655807495, + "num_tokens": 189775152.0, + "step": 7332 + }, + { + "epoch": 0.8052932132659785, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2611899375915527, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7331093549728394, + "num_tokens": 189801396.0, + "step": 7333 + }, + { + "epoch": 0.8054030309685921, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.516585350036621, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6873286962509155, + "num_tokens": 189824889.0, + "step": 7334 + }, + { + "epoch": 0.8055128486712058, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.6120247840881348, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7115042209625244, + "num_tokens": 189845178.0, + "step": 7335 + }, + { + "epoch": 0.8056226663738194, + "ewc_loss": 1.3828277587890625e-05, 
+ "grad_norm": 2.276777505874634, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7055709362030029, + "num_tokens": 189869934.0, + "step": 7336 + }, + { + "epoch": 0.8057324840764332, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2327628135681152, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6880029439926147, + "num_tokens": 189897957.0, + "step": 7337 + }, + { + "epoch": 0.8058423017790468, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2558743953704834, + "learning_rate": 1e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.6797524690628052, + "num_tokens": 189926967.0, + "step": 7338 + }, + { + "epoch": 0.8059521194816605, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.469184637069702, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7200194597244263, + "num_tokens": 189948959.0, + "step": 7339 + }, + { + "epoch": 0.8060619371842741, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.1091036796569824, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6985576748847961, + "num_tokens": 189979548.0, + "step": 7340 + }, + { + "epoch": 0.8061717548868877, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.484441041946411, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6878413558006287, + "num_tokens": 190003615.0, + "step": 7341 + }, + { + "epoch": 0.8062815725895014, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.3342881202697754, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7278301119804382, + "num_tokens": 190027255.0, + "step": 7342 + }, + { + "epoch": 0.806391390292115, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.305638313293457, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7051591873168945, + "num_tokens": 190054949.0, + "step": 7343 + }, + { + "epoch": 0.8065012079947288, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.359255075454712, + 
"learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7285003662109375, + "num_tokens": 190077434.0, + "step": 7344 + }, + { + "epoch": 0.8066110256973424, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.3565406799316406, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7069875001907349, + "num_tokens": 190105048.0, + "step": 7345 + }, + { + "epoch": 0.8067208433999561, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.2068610191345215, + "learning_rate": 1e-06, + "loss": 1.17, + "mean_token_accuracy": 0.6638195514678955, + "num_tokens": 190135929.0, + "step": 7346 + }, + { + "epoch": 0.8068306611025697, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.1785221099853516, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6918144226074219, + "num_tokens": 190164509.0, + "step": 7347 + }, + { + "epoch": 0.8069404788051834, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.3832290172576904, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7226682901382446, + "num_tokens": 190186916.0, + "step": 7348 + }, + { + "epoch": 0.807050296507797, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.3682751655578613, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6947047710418701, + "num_tokens": 190215352.0, + "step": 7349 + }, + { + "epoch": 0.8071601142104107, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.3245303630828857, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7127826809883118, + "num_tokens": 190240627.0, + "step": 7350 + }, + { + "epoch": 0.8072699319130244, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.3504629135131836, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6985751390457153, + "num_tokens": 190266976.0, + "step": 7351 + }, + { + "epoch": 0.8073797496156381, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.531304359436035, + "learning_rate": 1e-06, + "loss": 
0.9383, + "mean_token_accuracy": 0.7230100631713867, + "num_tokens": 190290414.0, + "step": 7352 + }, + { + "epoch": 0.8074895673182517, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.537437677383423, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6880365610122681, + "num_tokens": 190314442.0, + "step": 7353 + }, + { + "epoch": 0.8075993850208654, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.28035569190979, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7255201935768127, + "num_tokens": 190341818.0, + "step": 7354 + }, + { + "epoch": 0.807709202723479, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.259770154953003, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7257682085037231, + "num_tokens": 190368152.0, + "step": 7355 + }, + { + "epoch": 0.8078190204260927, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.640584707260132, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7039310336112976, + "num_tokens": 190387620.0, + "step": 7356 + }, + { + "epoch": 0.8079288381287063, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.4548192024230957, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7053966522216797, + "num_tokens": 190411877.0, + "step": 7357 + }, + { + "epoch": 0.8080386558313201, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.4158740043640137, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7122557163238525, + "num_tokens": 190436207.0, + "step": 7358 + }, + { + "epoch": 0.8081484735339337, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.0378074645996094, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7036199569702148, + "num_tokens": 190469819.0, + "step": 7359 + }, + { + "epoch": 0.8082582912365474, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.2013936042785645, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 
0.7102384567260742, + "num_tokens": 190497336.0, + "step": 7360 + }, + { + "epoch": 0.808368108939161, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.219292640686035, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6841385364532471, + "num_tokens": 190526504.0, + "step": 7361 + }, + { + "epoch": 0.8084779266417746, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.1483330726623535, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6935380101203918, + "num_tokens": 190559033.0, + "step": 7362 + }, + { + "epoch": 0.8085877443443883, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.832669496536255, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7244358658790588, + "num_tokens": 190576453.0, + "step": 7363 + }, + { + "epoch": 0.8086975620470019, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.443152666091919, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6953859329223633, + "num_tokens": 190601617.0, + "step": 7364 + }, + { + "epoch": 0.8088073797496156, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.614844799041748, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.695059061050415, + "num_tokens": 190622769.0, + "step": 7365 + }, + { + "epoch": 0.8089171974522293, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.1078436374664307, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.7029914855957031, + "num_tokens": 190654941.0, + "step": 7366 + }, + { + "epoch": 0.809027015154843, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.3802614212036133, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6891459226608276, + "num_tokens": 190679722.0, + "step": 7367 + }, + { + "epoch": 0.8091368328574566, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.2062594890594482, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6982213854789734, + "num_tokens": 
190708192.0, + "step": 7368 + }, + { + "epoch": 0.8092466505600703, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.5100369453430176, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6956326961517334, + "num_tokens": 190731374.0, + "step": 7369 + }, + { + "epoch": 0.8093564682626839, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.3119168281555176, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6974397897720337, + "num_tokens": 190759008.0, + "step": 7370 + }, + { + "epoch": 0.8094662859652976, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.2982113361358643, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7043992280960083, + "num_tokens": 190784306.0, + "step": 7371 + }, + { + "epoch": 0.8095761036679112, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.363386392593384, + "learning_rate": 1e-06, + "loss": 1.1108, + "mean_token_accuracy": 0.6786282062530518, + "num_tokens": 190812971.0, + "step": 7372 + }, + { + "epoch": 0.809685921370525, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.5175955295562744, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7139288187026978, + "num_tokens": 190835592.0, + "step": 7373 + }, + { + "epoch": 0.8097957390731386, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 2.3562026023864746, + "learning_rate": 1e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.681527853012085, + "num_tokens": 190864363.0, + "step": 7374 + }, + { + "epoch": 0.8099055567757523, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.915792942047119, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.7056174278259277, + "num_tokens": 190882458.0, + "step": 7375 + }, + { + "epoch": 0.8100153744783659, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.347217082977295, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6928355693817139, + "num_tokens": 190909073.0, + "step": 7376 + }, + { 
+ "epoch": 0.8101251921809796, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.3704211711883545, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7158048152923584, + "num_tokens": 190933418.0, + "step": 7377 + }, + { + "epoch": 0.8102350098835932, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.4250335693359375, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6892357468605042, + "num_tokens": 190957448.0, + "step": 7378 + }, + { + "epoch": 0.8103448275862069, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.2726428508758545, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7185996174812317, + "num_tokens": 190984754.0, + "step": 7379 + }, + { + "epoch": 0.8104546452888206, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.619483470916748, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6956371068954468, + "num_tokens": 191007254.0, + "step": 7380 + }, + { + "epoch": 0.8105644629914343, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.535036325454712, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7047082185745239, + "num_tokens": 191030203.0, + "step": 7381 + }, + { + "epoch": 0.8106742806940479, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.478069543838501, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6996060013771057, + "num_tokens": 191053705.0, + "step": 7382 + }, + { + "epoch": 0.8107840983966615, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.244659185409546, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7077032327651978, + "num_tokens": 191081547.0, + "step": 7383 + }, + { + "epoch": 0.8108939160992752, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.8531885147094727, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7215703129768372, + "num_tokens": 191100151.0, + "step": 7384 + }, + { + "epoch": 0.8110037338018888, + 
"ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.1935548782348633, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.714423656463623, + "num_tokens": 191128014.0, + "step": 7385 + }, + { + "epoch": 0.8111135515045025, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.2679994106292725, + "learning_rate": 1e-06, + "loss": 1.1038, + "mean_token_accuracy": 0.6806116104125977, + "num_tokens": 191154703.0, + "step": 7386 + }, + { + "epoch": 0.8112233692071162, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.2850470542907715, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7035770416259766, + "num_tokens": 191180285.0, + "step": 7387 + }, + { + "epoch": 0.8113331869097299, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.5724093914031982, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6978203654289246, + "num_tokens": 191201760.0, + "step": 7388 + }, + { + "epoch": 0.8114430046123435, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.179384708404541, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6955917477607727, + "num_tokens": 191231226.0, + "step": 7389 + }, + { + "epoch": 0.8115528223149572, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.5380327701568604, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6803284883499146, + "num_tokens": 191254797.0, + "step": 7390 + }, + { + "epoch": 0.8116626400175708, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.5307791233062744, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7219038605690002, + "num_tokens": 191278029.0, + "step": 7391 + }, + { + "epoch": 0.8117724577201845, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.163051128387451, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7001755237579346, + "num_tokens": 191306205.0, + "step": 7392 + }, + { + "epoch": 0.8118822754227981, + "ewc_loss": 1.3947486877441406e-05, 
+ "grad_norm": 2.3403756618499756, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.688069760799408, + "num_tokens": 191331158.0, + "step": 7393 + }, + { + "epoch": 0.8119920931254118, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.219846487045288, + "learning_rate": 1e-06, + "loss": 1.0824, + "mean_token_accuracy": 0.6909351348876953, + "num_tokens": 191362487.0, + "step": 7394 + }, + { + "epoch": 0.8121019108280255, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2750015258789062, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6991948485374451, + "num_tokens": 191389257.0, + "step": 7395 + }, + { + "epoch": 0.8122117285306392, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3878893852233887, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6989724040031433, + "num_tokens": 191414774.0, + "step": 7396 + }, + { + "epoch": 0.8123215462332528, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4546635150909424, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7122257947921753, + "num_tokens": 191438055.0, + "step": 7397 + }, + { + "epoch": 0.8124313639358665, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4116978645324707, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6974902153015137, + "num_tokens": 191461414.0, + "step": 7398 + }, + { + "epoch": 0.8125411816384801, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4579877853393555, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7166422009468079, + "num_tokens": 191485426.0, + "step": 7399 + }, + { + "epoch": 0.8126509993410937, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2821903228759766, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7017700672149658, + "num_tokens": 191513727.0, + "step": 7400 + }, + { + "epoch": 0.8127608170437074, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.308520793914795, 
+ "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.684675931930542, + "num_tokens": 191541044.0, + "step": 7401 + }, + { + "epoch": 0.8128706347463212, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.9829845428466797, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7317484617233276, + "num_tokens": 191575243.0, + "step": 7402 + }, + { + "epoch": 0.8129804524489348, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.328840732574463, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7120404243469238, + "num_tokens": 191599817.0, + "step": 7403 + }, + { + "epoch": 0.8130902701515484, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.138044834136963, + "learning_rate": 1e-06, + "loss": 1.1194, + "mean_token_accuracy": 0.6701014041900635, + "num_tokens": 191633627.0, + "step": 7404 + }, + { + "epoch": 0.8132000878541621, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2630655765533447, + "learning_rate": 1e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6871938705444336, + "num_tokens": 191661966.0, + "step": 7405 + }, + { + "epoch": 0.8133099055567757, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.277597427368164, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6989137530326843, + "num_tokens": 191689951.0, + "step": 7406 + }, + { + "epoch": 0.8134197232593894, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.327146291732788, + "learning_rate": 1e-06, + "loss": 1.1011, + "mean_token_accuracy": 0.6793947219848633, + "num_tokens": 191716482.0, + "step": 7407 + }, + { + "epoch": 0.813529540962003, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.4711475372314453, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.7014350891113281, + "num_tokens": 191741044.0, + "step": 7408 + }, + { + "epoch": 0.8136393586646168, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.192729949951172, + "learning_rate": 1e-06, + "loss": 
1.1526, + "mean_token_accuracy": 0.6697111129760742, + "num_tokens": 191772667.0, + "step": 7409 + }, + { + "epoch": 0.8137491763672304, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.217834949493408, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7094135284423828, + "num_tokens": 191801028.0, + "step": 7410 + }, + { + "epoch": 0.8138589940698441, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.315903902053833, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7027326822280884, + "num_tokens": 191825693.0, + "step": 7411 + }, + { + "epoch": 0.8139688117724577, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.325221061706543, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7169455885887146, + "num_tokens": 191850044.0, + "step": 7412 + }, + { + "epoch": 0.8140786294750714, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.4358861446380615, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7063242793083191, + "num_tokens": 191873183.0, + "step": 7413 + }, + { + "epoch": 0.814188447177685, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 2.2861251831054688, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7054149508476257, + "num_tokens": 191899944.0, + "step": 7414 + }, + { + "epoch": 0.8142982648802987, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2770450115203857, + "learning_rate": 1e-06, + "loss": 1.1015, + "mean_token_accuracy": 0.6765667200088501, + "num_tokens": 191927847.0, + "step": 7415 + }, + { + "epoch": 0.8144080825829124, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.351785659790039, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7013635635375977, + "num_tokens": 191953831.0, + "step": 7416 + }, + { + "epoch": 0.8145179002855261, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3316025733947754, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 
0.6967296004295349, + "num_tokens": 191979703.0, + "step": 7417 + }, + { + "epoch": 0.8146277179881397, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.236017942428589, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7206462621688843, + "num_tokens": 192007014.0, + "step": 7418 + }, + { + "epoch": 0.8147375356907534, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2857673168182373, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.683809220790863, + "num_tokens": 192034396.0, + "step": 7419 + }, + { + "epoch": 0.814847353393367, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.219031810760498, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6870315670967102, + "num_tokens": 192065478.0, + "step": 7420 + }, + { + "epoch": 0.8149571710959806, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3356969356536865, + "learning_rate": 1e-06, + "loss": 1.0919, + "mean_token_accuracy": 0.6906858682632446, + "num_tokens": 192091541.0, + "step": 7421 + }, + { + "epoch": 0.8150669887985943, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.501950740814209, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7168159484863281, + "num_tokens": 192113111.0, + "step": 7422 + }, + { + "epoch": 0.8151768065012079, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3193016052246094, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6910862922668457, + "num_tokens": 192139874.0, + "step": 7423 + }, + { + "epoch": 0.8152866242038217, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.128584623336792, + "learning_rate": 1e-06, + "loss": 1.1, + "mean_token_accuracy": 0.6828427910804749, + "num_tokens": 192173021.0, + "step": 7424 + }, + { + "epoch": 0.8153964419064353, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2401304244995117, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7226896286010742, + "num_tokens": 
192200042.0, + "step": 7425 + }, + { + "epoch": 0.815506259609049, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.1263411045074463, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.7019500732421875, + "num_tokens": 192233652.0, + "step": 7426 + }, + { + "epoch": 0.8156160773116626, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3347809314727783, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6921350955963135, + "num_tokens": 192259177.0, + "step": 7427 + }, + { + "epoch": 0.8157258950142763, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.179671049118042, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6881073713302612, + "num_tokens": 192289215.0, + "step": 7428 + }, + { + "epoch": 0.8158357127168899, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3403584957122803, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7242575287818909, + "num_tokens": 192311361.0, + "step": 7429 + }, + { + "epoch": 0.8159455304195036, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3734233379364014, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7167571783065796, + "num_tokens": 192336211.0, + "step": 7430 + }, + { + "epoch": 0.8160553481221173, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3757264614105225, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7121649980545044, + "num_tokens": 192359588.0, + "step": 7431 + }, + { + "epoch": 0.816165165824731, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2961316108703613, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.689578652381897, + "num_tokens": 192387109.0, + "step": 7432 + }, + { + "epoch": 0.8162749835273446, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.301510810852051, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.7021673917770386, + "num_tokens": 192414021.0, + "step": 7433 + }, + { + 
"epoch": 0.8163848012299583, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.140500068664551, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6912168264389038, + "num_tokens": 192444406.0, + "step": 7434 + }, + { + "epoch": 0.8164946189325719, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.302882194519043, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.685005247592926, + "num_tokens": 192472462.0, + "step": 7435 + }, + { + "epoch": 0.8166044366351856, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2311174869537354, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6967750787734985, + "num_tokens": 192501003.0, + "step": 7436 + }, + { + "epoch": 0.8167142543377992, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.486515522003174, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7173274755477905, + "num_tokens": 192521744.0, + "step": 7437 + }, + { + "epoch": 0.816824072040413, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.08211350440979, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7178653478622437, + "num_tokens": 192551757.0, + "step": 7438 + }, + { + "epoch": 0.8169338897430266, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.102524518966675, + "learning_rate": 1e-06, + "loss": 1.0658, + "mean_token_accuracy": 0.6891583204269409, + "num_tokens": 192585787.0, + "step": 7439 + }, + { + "epoch": 0.8170437074456403, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.6743276119232178, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7149495482444763, + "num_tokens": 192605935.0, + "step": 7440 + }, + { + "epoch": 0.8171535251482539, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5075578689575195, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7100578546524048, + "num_tokens": 192629921.0, + "step": 7441 + }, + { + "epoch": 0.8172633428508675, + "ewc_loss": 
1.4007091522216797e-05, + "grad_norm": 2.1563665866851807, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.714978039264679, + "num_tokens": 192660071.0, + "step": 7442 + }, + { + "epoch": 0.8173731605534812, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4911909103393555, + "learning_rate": 1e-06, + "loss": 1.097, + "mean_token_accuracy": 0.6750783324241638, + "num_tokens": 192686540.0, + "step": 7443 + }, + { + "epoch": 0.8174829782560948, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.18176007270813, + "learning_rate": 1e-06, + "loss": 1.0898, + "mean_token_accuracy": 0.6856597661972046, + "num_tokens": 192718722.0, + "step": 7444 + }, + { + "epoch": 0.8175927959587086, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5040907859802246, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6978000998497009, + "num_tokens": 192743632.0, + "step": 7445 + }, + { + "epoch": 0.8177026136613222, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3281118869781494, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7284663319587708, + "num_tokens": 192768444.0, + "step": 7446 + }, + { + "epoch": 0.8178124313639359, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3334310054779053, + "learning_rate": 1e-06, + "loss": 1.1074, + "mean_token_accuracy": 0.6837062835693359, + "num_tokens": 192795814.0, + "step": 7447 + }, + { + "epoch": 0.8179222490665495, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.238896369934082, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7039597034454346, + "num_tokens": 192821027.0, + "step": 7448 + }, + { + "epoch": 0.8180320667691632, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3318214416503906, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6877304315567017, + "num_tokens": 192846075.0, + "step": 7449 + }, + { + "epoch": 0.8181418844717768, + "ewc_loss": 1.4007091522216797e-05, + 
"grad_norm": 2.004106044769287, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7148981690406799, + "num_tokens": 192875195.0, + "step": 7450 + }, + { + "epoch": 0.8182517021743905, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.371009349822998, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6990658640861511, + "num_tokens": 192899409.0, + "step": 7451 + }, + { + "epoch": 0.8183615198770041, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2830049991607666, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7042287588119507, + "num_tokens": 192924224.0, + "step": 7452 + }, + { + "epoch": 0.8184713375796179, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.666391611099243, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7022565007209778, + "num_tokens": 192944988.0, + "step": 7453 + }, + { + "epoch": 0.8185811552822315, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.478982448577881, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7270712852478027, + "num_tokens": 192967089.0, + "step": 7454 + }, + { + "epoch": 0.8186909729848452, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.411227226257324, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7023442983627319, + "num_tokens": 192993389.0, + "step": 7455 + }, + { + "epoch": 0.8188007906874588, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.52371883392334, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7073454856872559, + "num_tokens": 193016217.0, + "step": 7456 + }, + { + "epoch": 0.8189106083900725, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.312962055206299, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6936757564544678, + "num_tokens": 193043656.0, + "step": 7457 + }, + { + "epoch": 0.8190204260926861, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5479178428649902, + 
"learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7071328163146973, + "num_tokens": 193066359.0, + "step": 7458 + }, + { + "epoch": 0.8191302437952998, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.244551420211792, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6878526210784912, + "num_tokens": 193096540.0, + "step": 7459 + }, + { + "epoch": 0.8192400614979135, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.514460802078247, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6997731924057007, + "num_tokens": 193119171.0, + "step": 7460 + }, + { + "epoch": 0.8193498792005272, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2135729789733887, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6974590420722961, + "num_tokens": 193149032.0, + "step": 7461 + }, + { + "epoch": 0.8194596969031408, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3121609687805176, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.7067803740501404, + "num_tokens": 193174665.0, + "step": 7462 + }, + { + "epoch": 0.8195695146057544, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5246076583862305, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7063535451889038, + "num_tokens": 193198821.0, + "step": 7463 + }, + { + "epoch": 0.8196793323083681, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.51747727394104, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7487338185310364, + "num_tokens": 193219550.0, + "step": 7464 + }, + { + "epoch": 0.8197891500109817, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.295395851135254, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7162847518920898, + "num_tokens": 193245055.0, + "step": 7465 + }, + { + "epoch": 0.8198989677135954, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.1347408294677734, + "learning_rate": 1e-06, + "loss": 
0.9662, + "mean_token_accuracy": 0.7133063673973083, + "num_tokens": 193274167.0, + "step": 7466 + }, + { + "epoch": 0.8200087854162091, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.509934902191162, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7188510894775391, + "num_tokens": 193297555.0, + "step": 7467 + }, + { + "epoch": 0.8201186031188228, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4081242084503174, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7060279846191406, + "num_tokens": 193324070.0, + "step": 7468 + }, + { + "epoch": 0.8202284208214364, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5200283527374268, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6970893144607544, + "num_tokens": 193347653.0, + "step": 7469 + }, + { + "epoch": 0.8203382385240501, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3755598068237305, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7071307897567749, + "num_tokens": 193372170.0, + "step": 7470 + }, + { + "epoch": 0.8204480562266637, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2956106662750244, + "learning_rate": 1e-06, + "loss": 1.1362, + "mean_token_accuracy": 0.6673611402511597, + "num_tokens": 193404706.0, + "step": 7471 + }, + { + "epoch": 0.8205578739292774, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2484583854675293, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7145348787307739, + "num_tokens": 193431115.0, + "step": 7472 + }, + { + "epoch": 0.820667691631891, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.342420816421509, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.691921055316925, + "num_tokens": 193458507.0, + "step": 7473 + }, + { + "epoch": 0.8207775093345048, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.1688430309295654, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 
0.6982452869415283, + "num_tokens": 193488381.0, + "step": 7474 + }, + { + "epoch": 0.8208873270371184, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2487239837646484, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6927589178085327, + "num_tokens": 193519142.0, + "step": 7475 + }, + { + "epoch": 0.8209971447397321, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.347369909286499, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7130957841873169, + "num_tokens": 193543492.0, + "step": 7476 + }, + { + "epoch": 0.8211069624423457, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3489110469818115, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6965298056602478, + "num_tokens": 193567255.0, + "step": 7477 + }, + { + "epoch": 0.8212167801449594, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.60749888420105, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7078052163124084, + "num_tokens": 193588674.0, + "step": 7478 + }, + { + "epoch": 0.821326597847573, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2503669261932373, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6930456757545471, + "num_tokens": 193617385.0, + "step": 7479 + }, + { + "epoch": 0.8214364155501866, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.336865186691284, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7296661734580994, + "num_tokens": 193641889.0, + "step": 7480 + }, + { + "epoch": 0.8215462332528004, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.629640579223633, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.687942385673523, + "num_tokens": 193663586.0, + "step": 7481 + }, + { + "epoch": 0.821656050955414, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.572648763656616, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7049139142036438, + "num_tokens": 
193685218.0, + "step": 7482 + }, + { + "epoch": 0.8217658686580277, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4449896812438965, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7159751653671265, + "num_tokens": 193708146.0, + "step": 7483 + }, + { + "epoch": 0.8218756863606413, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.63080096244812, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7087620496749878, + "num_tokens": 193727761.0, + "step": 7484 + }, + { + "epoch": 0.821985504063255, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3843255043029785, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6972350478172302, + "num_tokens": 193751345.0, + "step": 7485 + }, + { + "epoch": 0.8220953217658686, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.202360153198242, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7277184128761292, + "num_tokens": 193781026.0, + "step": 7486 + }, + { + "epoch": 0.8222051394684823, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3230791091918945, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7106415033340454, + "num_tokens": 193807025.0, + "step": 7487 + }, + { + "epoch": 0.8223149571710959, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2293701171875, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7301464080810547, + "num_tokens": 193831146.0, + "step": 7488 + }, + { + "epoch": 0.8224247748737097, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3374886512756348, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7267725467681885, + "num_tokens": 193856325.0, + "step": 7489 + }, + { + "epoch": 0.8225345925763233, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3416476249694824, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6951549053192139, + "num_tokens": 193881662.0, + "step": 7490 + }, + { + 
"epoch": 0.822644410278937, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2862660884857178, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6999372243881226, + "num_tokens": 193909299.0, + "step": 7491 + }, + { + "epoch": 0.8227542279815506, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.0362842082977295, + "learning_rate": 1e-06, + "loss": 1.1476, + "mean_token_accuracy": 0.6659644246101379, + "num_tokens": 193945491.0, + "step": 7492 + }, + { + "epoch": 0.8228640456841643, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.1513962745666504, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6960777044296265, + "num_tokens": 193972892.0, + "step": 7493 + }, + { + "epoch": 0.8229738633867779, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.6009304523468018, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.694770336151123, + "num_tokens": 193994630.0, + "step": 7494 + }, + { + "epoch": 0.8230836810893916, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5564076900482178, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7233982086181641, + "num_tokens": 194016765.0, + "step": 7495 + }, + { + "epoch": 0.8231934987920053, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.291158676147461, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7104097604751587, + "num_tokens": 194041630.0, + "step": 7496 + }, + { + "epoch": 0.823303316494619, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.809812307357788, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7002962827682495, + "num_tokens": 194059924.0, + "step": 7497 + }, + { + "epoch": 0.8234131341972326, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.258934259414673, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.702970564365387, + "num_tokens": 194087610.0, + "step": 7498 + }, + { + "epoch": 0.8235229518998463, + 
"ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2689690589904785, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7165124416351318, + "num_tokens": 194112974.0, + "step": 7499 + }, + { + "epoch": 0.8236327696024599, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.033799648284912, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7231711149215698, + "num_tokens": 194145084.0, + "step": 7500 + }, + { + "epoch": 0.8237425873050735, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2170073986053467, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.705097496509552, + "num_tokens": 194170949.0, + "step": 7501 + }, + { + "epoch": 0.8238524050076872, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.478191614151001, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.706673264503479, + "num_tokens": 194194584.0, + "step": 7502 + }, + { + "epoch": 0.823962222710301, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.386113405227661, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.699986457824707, + "num_tokens": 194219238.0, + "step": 7503 + }, + { + "epoch": 0.8240720404129146, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3044426441192627, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.715347170829773, + "num_tokens": 194245490.0, + "step": 7504 + }, + { + "epoch": 0.8241818581155282, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.1464006900787354, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6944166421890259, + "num_tokens": 194277489.0, + "step": 7505 + }, + { + "epoch": 0.8242916758181419, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.185943365097046, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7242950201034546, + "num_tokens": 194304550.0, + "step": 7506 + }, + { + "epoch": 0.8244014935207555, + "ewc_loss": 1.4007091522216797e-05, + 
"grad_norm": 2.454916477203369, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7196648716926575, + "num_tokens": 194328241.0, + "step": 7507 + }, + { + "epoch": 0.8245113112233692, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5051348209381104, + "learning_rate": 1e-06, + "loss": 1.1311, + "mean_token_accuracy": 0.6856083869934082, + "num_tokens": 194353809.0, + "step": 7508 + }, + { + "epoch": 0.8246211289259828, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.6239888668060303, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7362210750579834, + "num_tokens": 194374441.0, + "step": 7509 + }, + { + "epoch": 0.8247309466285966, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.276968240737915, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7121143341064453, + "num_tokens": 194401876.0, + "step": 7510 + }, + { + "epoch": 0.8248407643312102, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5347774028778076, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7245808243751526, + "num_tokens": 194423064.0, + "step": 7511 + }, + { + "epoch": 0.8249505820338239, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3923840522766113, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7179263830184937, + "num_tokens": 194449636.0, + "step": 7512 + }, + { + "epoch": 0.8250603997364375, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.314548969268799, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7138869762420654, + "num_tokens": 194476248.0, + "step": 7513 + }, + { + "epoch": 0.8251702174390512, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2828516960144043, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6969071626663208, + "num_tokens": 194504609.0, + "step": 7514 + }, + { + "epoch": 0.8252800351416648, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3999252319335938, + 
"learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7066977024078369, + "num_tokens": 194526147.0, + "step": 7515 + }, + { + "epoch": 0.8253898528442785, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 3.7945749759674072, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.698417067527771, + "num_tokens": 194552878.0, + "step": 7516 + }, + { + "epoch": 0.8254996705468921, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.1415157318115234, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6963711977005005, + "num_tokens": 194581847.0, + "step": 7517 + }, + { + "epoch": 0.8256094882495059, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.125067710876465, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7038143873214722, + "num_tokens": 194610977.0, + "step": 7518 + }, + { + "epoch": 0.8257193059521195, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3727519512176514, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7512389421463013, + "num_tokens": 194633139.0, + "step": 7519 + }, + { + "epoch": 0.8258291236547332, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.405491828918457, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6950892806053162, + "num_tokens": 194658417.0, + "step": 7520 + }, + { + "epoch": 0.8259389413573468, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.452871561050415, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7028822302818298, + "num_tokens": 194682723.0, + "step": 7521 + }, + { + "epoch": 0.8260487590599604, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3890299797058105, + "learning_rate": 1e-06, + "loss": 1.1265, + "mean_token_accuracy": 0.6711969375610352, + "num_tokens": 194709719.0, + "step": 7522 + }, + { + "epoch": 0.8261585767625741, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4344077110290527, + "learning_rate": 1e-06, + "loss": 
0.9997, + "mean_token_accuracy": 0.7193132042884827, + "num_tokens": 194735545.0, + "step": 7523 + }, + { + "epoch": 0.8262683944651877, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.229929208755493, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.707581102848053, + "num_tokens": 194764463.0, + "step": 7524 + }, + { + "epoch": 0.8263782121678015, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.454181671142578, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.7013142108917236, + "num_tokens": 194788477.0, + "step": 7525 + }, + { + "epoch": 0.8264880298704151, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.197223424911499, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7097310423851013, + "num_tokens": 194818075.0, + "step": 7526 + }, + { + "epoch": 0.8265978475730288, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.263387680053711, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7067350149154663, + "num_tokens": 194844525.0, + "step": 7527 + }, + { + "epoch": 0.8267076652756424, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.09641170501709, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6881327629089355, + "num_tokens": 194875526.0, + "step": 7528 + }, + { + "epoch": 0.8268174829782561, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2124271392822266, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7025936841964722, + "num_tokens": 194902455.0, + "step": 7529 + }, + { + "epoch": 0.8269273006808697, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.411802291870117, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7245302200317383, + "num_tokens": 194924955.0, + "step": 7530 + }, + { + "epoch": 0.8270371183834834, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.049103260040283, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 
0.7134947776794434, + "num_tokens": 194956923.0, + "step": 7531 + }, + { + "epoch": 0.8271469360860971, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.345116376876831, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7077875137329102, + "num_tokens": 194982918.0, + "step": 7532 + }, + { + "epoch": 0.8272567537887108, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3871021270751953, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6933237314224243, + "num_tokens": 195006390.0, + "step": 7533 + }, + { + "epoch": 0.8273665714913244, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3653564453125, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7184144258499146, + "num_tokens": 195030450.0, + "step": 7534 + }, + { + "epoch": 0.8274763891939381, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3143222332000732, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6832983493804932, + "num_tokens": 195057157.0, + "step": 7535 + }, + { + "epoch": 0.8275862068965517, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 3.8166630268096924, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.686760425567627, + "num_tokens": 195086748.0, + "step": 7536 + }, + { + "epoch": 0.8276960245991654, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.1347358226776123, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7017379999160767, + "num_tokens": 195116699.0, + "step": 7537 + }, + { + "epoch": 0.827805842301779, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2108662128448486, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7072554230690002, + "num_tokens": 195144108.0, + "step": 7538 + }, + { + "epoch": 0.8279156600043928, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.417860984802246, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7094530463218689, + "num_tokens": 
195167984.0, + "step": 7539 + }, + { + "epoch": 0.8280254777070064, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.7171080112457275, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.737883448600769, + "num_tokens": 195186092.0, + "step": 7540 + }, + { + "epoch": 0.82813529540962, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2586867809295654, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7029967308044434, + "num_tokens": 195213490.0, + "step": 7541 + }, + { + "epoch": 0.8282451131122337, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.6305124759674072, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7034688591957092, + "num_tokens": 195235215.0, + "step": 7542 + }, + { + "epoch": 0.8283549308148473, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.128396511077881, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6858349442481995, + "num_tokens": 195268719.0, + "step": 7543 + }, + { + "epoch": 0.828464748517461, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.209852457046509, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6846728324890137, + "num_tokens": 195302496.0, + "step": 7544 + }, + { + "epoch": 0.8285745662200746, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.390366554260254, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.7011710405349731, + "num_tokens": 195328694.0, + "step": 7545 + }, + { + "epoch": 0.8286843839226883, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3521170616149902, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7137457728385925, + "num_tokens": 195352880.0, + "step": 7546 + }, + { + "epoch": 0.828794201625302, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.0657427310943604, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7331736087799072, + "num_tokens": 195381758.0, + "step": 7547 + }, + { + 
"epoch": 0.8289040193279157, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4262595176696777, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6905659437179565, + "num_tokens": 195406281.0, + "step": 7548 + }, + { + "epoch": 0.8290138370305293, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5400002002716064, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6932439804077148, + "num_tokens": 195431651.0, + "step": 7549 + }, + { + "epoch": 0.829123654733143, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2084171772003174, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.709674596786499, + "num_tokens": 195460148.0, + "step": 7550 + }, + { + "epoch": 0.8292334724357566, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2904703617095947, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7436215281486511, + "num_tokens": 195484865.0, + "step": 7551 + }, + { + "epoch": 0.8293432901383703, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3366360664367676, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7091646790504456, + "num_tokens": 195510871.0, + "step": 7552 + }, + { + "epoch": 0.8294531078409839, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.261298656463623, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6933258175849915, + "num_tokens": 195540329.0, + "step": 7553 + }, + { + "epoch": 0.8295629255435977, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4101431369781494, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7250044345855713, + "num_tokens": 195563569.0, + "step": 7554 + }, + { + "epoch": 0.8296727432462113, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.7188422679901123, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7058324813842773, + "num_tokens": 195584615.0, + "step": 7555 + }, + { + "epoch": 0.829782560948825, + 
"ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.191310167312622, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6828011274337769, + "num_tokens": 195612512.0, + "step": 7556 + }, + { + "epoch": 0.8298923786514386, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.779524564743042, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7137400507926941, + "num_tokens": 195632874.0, + "step": 7557 + }, + { + "epoch": 0.8300021963540523, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.5814998149871826, + "learning_rate": 1e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.6916650533676147, + "num_tokens": 195655467.0, + "step": 7558 + }, + { + "epoch": 0.8301120140566659, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.035550355911255, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.7033848762512207, + "num_tokens": 195687886.0, + "step": 7559 + }, + { + "epoch": 0.8302218317592795, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.442472219467163, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.7026699781417847, + "num_tokens": 195711794.0, + "step": 7560 + }, + { + "epoch": 0.8303316494618933, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.1941280364990234, + "learning_rate": 1e-06, + "loss": 1.1105, + "mean_token_accuracy": 0.6756755113601685, + "num_tokens": 195742931.0, + "step": 7561 + }, + { + "epoch": 0.830441467164507, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.7076964378356934, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7136942148208618, + "num_tokens": 195762619.0, + "step": 7562 + }, + { + "epoch": 0.8305512848671206, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.450432777404785, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.725799560546875, + "num_tokens": 195784610.0, + "step": 7563 + }, + { + "epoch": 0.8306611025697342, + "ewc_loss": 1.4066696166992188e-05, + 
"grad_norm": 2.3657913208007812, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.7015477418899536, + "num_tokens": 195810897.0, + "step": 7564 + }, + { + "epoch": 0.8307709202723479, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.044335126876831, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6843373775482178, + "num_tokens": 195846421.0, + "step": 7565 + }, + { + "epoch": 0.8308807379749615, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5677080154418945, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6980739831924438, + "num_tokens": 195868398.0, + "step": 7566 + }, + { + "epoch": 0.8309905556775752, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5758321285247803, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6965097188949585, + "num_tokens": 195889234.0, + "step": 7567 + }, + { + "epoch": 0.8311003733801889, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.792262077331543, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.701396107673645, + "num_tokens": 195908887.0, + "step": 7568 + }, + { + "epoch": 0.8312101910828026, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.6396024227142334, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.688910186290741, + "num_tokens": 195932480.0, + "step": 7569 + }, + { + "epoch": 0.8313200087854162, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.331864833831787, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.7010515928268433, + "num_tokens": 195959682.0, + "step": 7570 + }, + { + "epoch": 0.8314298264880299, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.1360089778900146, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6857332587242126, + "num_tokens": 195992913.0, + "step": 7571 + }, + { + "epoch": 0.8315396441906435, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.617940902709961, + 
"learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7084476947784424, + "num_tokens": 196013906.0, + "step": 7572 + }, + { + "epoch": 0.8316494618932572, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5574049949645996, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6833580732345581, + "num_tokens": 196039076.0, + "step": 7573 + }, + { + "epoch": 0.8317592795958708, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2388086318969727, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6997518539428711, + "num_tokens": 196070521.0, + "step": 7574 + }, + { + "epoch": 0.8318690972984845, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.291299819946289, + "learning_rate": 1e-06, + "loss": 1.1069, + "mean_token_accuracy": 0.6754738092422485, + "num_tokens": 196097730.0, + "step": 7575 + }, + { + "epoch": 0.8319789150010982, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.540752649307251, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7355225086212158, + "num_tokens": 196117974.0, + "step": 7576 + }, + { + "epoch": 0.8320887327037119, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2424123287200928, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.7012075185775757, + "num_tokens": 196146404.0, + "step": 7577 + }, + { + "epoch": 0.8321985504063255, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3176679611206055, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7122140526771545, + "num_tokens": 196171219.0, + "step": 7578 + }, + { + "epoch": 0.8323083681089392, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.7015223503112793, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.6820247173309326, + "num_tokens": 196194458.0, + "step": 7579 + }, + { + "epoch": 0.8324181858115528, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.290977954864502, + "learning_rate": 1e-06, + "loss": 
0.9452, + "mean_token_accuracy": 0.7231592535972595, + "num_tokens": 196219471.0, + "step": 7580 + }, + { + "epoch": 0.8325280035141664, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3356640338897705, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6922152042388916, + "num_tokens": 196243947.0, + "step": 7581 + }, + { + "epoch": 0.8326378212167801, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.355863094329834, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6990212202072144, + "num_tokens": 196270203.0, + "step": 7582 + }, + { + "epoch": 0.8327476389193939, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3548636436462402, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7092901468276978, + "num_tokens": 196297066.0, + "step": 7583 + }, + { + "epoch": 0.8328574566220075, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.534417152404785, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7111732363700867, + "num_tokens": 196318379.0, + "step": 7584 + }, + { + "epoch": 0.8329672743246211, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.6453957557678223, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7148175239562988, + "num_tokens": 196338137.0, + "step": 7585 + }, + { + "epoch": 0.8330770920272348, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2417051792144775, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.695233166217804, + "num_tokens": 196367424.0, + "step": 7586 + }, + { + "epoch": 0.8331869097298484, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.3184876441955566, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.713016152381897, + "num_tokens": 196391314.0, + "step": 7587 + }, + { + "epoch": 0.8332967274324621, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.3299291133880615, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 
0.6980094909667969, + "num_tokens": 196416374.0, + "step": 7588 + }, + { + "epoch": 0.8334065451350757, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.2951555252075195, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7274069786071777, + "num_tokens": 196441639.0, + "step": 7589 + }, + { + "epoch": 0.8335163628376895, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.081655263900757, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7112760543823242, + "num_tokens": 196471450.0, + "step": 7590 + }, + { + "epoch": 0.8336261805403031, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.175255060195923, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6877098083496094, + "num_tokens": 196500994.0, + "step": 7591 + }, + { + "epoch": 0.8337359982429168, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.286750316619873, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.710051417350769, + "num_tokens": 196526947.0, + "step": 7592 + }, + { + "epoch": 0.8338458159455304, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.546452760696411, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7035879492759705, + "num_tokens": 196550249.0, + "step": 7593 + }, + { + "epoch": 0.8339556336481441, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.38861346244812, + "learning_rate": 1e-06, + "loss": 1.1113, + "mean_token_accuracy": 0.6827690601348877, + "num_tokens": 196577281.0, + "step": 7594 + }, + { + "epoch": 0.8340654513507577, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3006210327148438, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7061832547187805, + "num_tokens": 196602095.0, + "step": 7595 + }, + { + "epoch": 0.8341752690533714, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2242238521575928, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6928772330284119, + "num_tokens": 
196629629.0, + "step": 7596 + }, + { + "epoch": 0.8342850867559851, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.6393043994903564, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7033346891403198, + "num_tokens": 196652009.0, + "step": 7597 + }, + { + "epoch": 0.8343949044585988, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4966719150543213, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7205467224121094, + "num_tokens": 196677268.0, + "step": 7598 + }, + { + "epoch": 0.8345047221612124, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.408811330795288, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6926461458206177, + "num_tokens": 196703110.0, + "step": 7599 + }, + { + "epoch": 0.834614539863826, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.0034420490264893, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6987747550010681, + "num_tokens": 196735845.0, + "step": 7600 + }, + { + "epoch": 0.8347243575664397, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.244743824005127, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7001621127128601, + "num_tokens": 196764382.0, + "step": 7601 + }, + { + "epoch": 0.8348341752690533, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4811313152313232, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7201412916183472, + "num_tokens": 196785955.0, + "step": 7602 + }, + { + "epoch": 0.834943992971667, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2356503009796143, + "learning_rate": 1e-06, + "loss": 1.1298, + "mean_token_accuracy": 0.671851634979248, + "num_tokens": 196815121.0, + "step": 7603 + }, + { + "epoch": 0.8350538106742806, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.0618224143981934, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7112205028533936, + "num_tokens": 196844968.0, + "step": 7604 + }, + { 
+ "epoch": 0.8351636283768944, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2628631591796875, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.7002148628234863, + "num_tokens": 196872347.0, + "step": 7605 + }, + { + "epoch": 0.835273446079508, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.4400863647460938, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6948032379150391, + "num_tokens": 196896899.0, + "step": 7606 + }, + { + "epoch": 0.8353832637821217, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.282917022705078, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.7012427449226379, + "num_tokens": 196925898.0, + "step": 7607 + }, + { + "epoch": 0.8354930814847353, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.393223285675049, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6968894004821777, + "num_tokens": 196949512.0, + "step": 7608 + }, + { + "epoch": 0.835602899187349, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.745818853378296, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7212727069854736, + "num_tokens": 196969131.0, + "step": 7609 + }, + { + "epoch": 0.8357127168899626, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.2842183113098145, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6901261210441589, + "num_tokens": 196996637.0, + "step": 7610 + }, + { + "epoch": 0.8358225345925763, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.607468366622925, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.7070708870887756, + "num_tokens": 197017944.0, + "step": 7611 + }, + { + "epoch": 0.83593235229519, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.5625064373016357, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7152093648910522, + "num_tokens": 197039417.0, + "step": 7612 + }, + { + "epoch": 0.8360421699978037, + 
"ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.377227783203125, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7088086605072021, + "num_tokens": 197063264.0, + "step": 7613 + }, + { + "epoch": 0.8361519877004173, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.4936883449554443, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7265231609344482, + "num_tokens": 197085752.0, + "step": 7614 + }, + { + "epoch": 0.836261805403031, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.1572377681732178, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6902137398719788, + "num_tokens": 197114138.0, + "step": 7615 + }, + { + "epoch": 0.8363716231056446, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.5248160362243652, + "learning_rate": 1e-06, + "loss": 1.0823, + "mean_token_accuracy": 0.6802998781204224, + "num_tokens": 197136693.0, + "step": 7616 + }, + { + "epoch": 0.8364814408082583, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.2985713481903076, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7216806411743164, + "num_tokens": 197161869.0, + "step": 7617 + }, + { + "epoch": 0.8365912585108719, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.1998579502105713, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.712425708770752, + "num_tokens": 197192424.0, + "step": 7618 + }, + { + "epoch": 0.8367010762134857, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.131859064102173, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7081388831138611, + "num_tokens": 197221800.0, + "step": 7619 + }, + { + "epoch": 0.8368108939160993, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.313969135284424, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.7054731845855713, + "num_tokens": 197249437.0, + "step": 7620 + }, + { + "epoch": 0.836920711618713, + "ewc_loss": 1.4007091522216797e-05, + 
"grad_norm": 2.4923787117004395, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7133080959320068, + "num_tokens": 197271744.0, + "step": 7621 + }, + { + "epoch": 0.8370305293213266, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.403334379196167, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7077633142471313, + "num_tokens": 197295207.0, + "step": 7622 + }, + { + "epoch": 0.8371403470239402, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.3847084045410156, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7065316438674927, + "num_tokens": 197319632.0, + "step": 7623 + }, + { + "epoch": 0.8372501647265539, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.309941530227661, + "learning_rate": 1e-06, + "loss": 1.0843, + "mean_token_accuracy": 0.6822025775909424, + "num_tokens": 197350363.0, + "step": 7624 + }, + { + "epoch": 0.8373599824291675, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.3171865940093994, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7199841737747192, + "num_tokens": 197375405.0, + "step": 7625 + }, + { + "epoch": 0.8374698001317813, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.360269546508789, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6914505362510681, + "num_tokens": 197401800.0, + "step": 7626 + }, + { + "epoch": 0.8375796178343949, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.469747543334961, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6907018423080444, + "num_tokens": 197425361.0, + "step": 7627 + }, + { + "epoch": 0.8376894355370086, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.287753105163574, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7091227173805237, + "num_tokens": 197451855.0, + "step": 7628 + }, + { + "epoch": 0.8377992532396222, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.5021140575408936, + 
"learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6888014078140259, + "num_tokens": 197475433.0, + "step": 7629 + }, + { + "epoch": 0.8379090709422359, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.749354839324951, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7118916511535645, + "num_tokens": 197495708.0, + "step": 7630 + }, + { + "epoch": 0.8380188886448495, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.476189613342285, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6926270723342896, + "num_tokens": 197529731.0, + "step": 7631 + }, + { + "epoch": 0.8381287063474632, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.3303163051605225, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6862877011299133, + "num_tokens": 197555343.0, + "step": 7632 + }, + { + "epoch": 0.8382385240500769, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.1182096004486084, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.712165117263794, + "num_tokens": 197585097.0, + "step": 7633 + }, + { + "epoch": 0.8383483417526906, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.3650248050689697, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7367879152297974, + "num_tokens": 197610696.0, + "step": 7634 + }, + { + "epoch": 0.8384581594553042, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.291790246963501, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6974939703941345, + "num_tokens": 197638076.0, + "step": 7635 + }, + { + "epoch": 0.8385679771579179, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.702664852142334, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7064283490180969, + "num_tokens": 197659044.0, + "step": 7636 + }, + { + "epoch": 0.8386777948605315, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.328113317489624, + "learning_rate": 1e-06, + "loss": 
1.014, + "mean_token_accuracy": 0.7035666108131409, + "num_tokens": 197684359.0, + "step": 7637 + }, + { + "epoch": 0.8387876125631452, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.606018304824829, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7062410116195679, + "num_tokens": 197706907.0, + "step": 7638 + }, + { + "epoch": 0.8388974302657588, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.2376935482025146, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7042523622512817, + "num_tokens": 197734448.0, + "step": 7639 + }, + { + "epoch": 0.8390072479683724, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.2667722702026367, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.704485297203064, + "num_tokens": 197760172.0, + "step": 7640 + }, + { + "epoch": 0.8391170656709862, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.359804630279541, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7032950520515442, + "num_tokens": 197786884.0, + "step": 7641 + }, + { + "epoch": 0.8392268833735999, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.454684257507324, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7074501514434814, + "num_tokens": 197809095.0, + "step": 7642 + }, + { + "epoch": 0.8393367010762135, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.3342370986938477, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7094650268554688, + "num_tokens": 197835389.0, + "step": 7643 + }, + { + "epoch": 0.8394465187788271, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.6487274169921875, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7297197580337524, + "num_tokens": 197857238.0, + "step": 7644 + }, + { + "epoch": 0.8395563364814408, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 2.0342578887939453, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 
0.6936087012290955, + "num_tokens": 197891332.0, + "step": 7645 + }, + { + "epoch": 0.8396661541840544, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.360879898071289, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6889244914054871, + "num_tokens": 197917516.0, + "step": 7646 + }, + { + "epoch": 0.8397759718866681, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 2.6172587871551514, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7178626656532288, + "num_tokens": 197937808.0, + "step": 7647 + }, + { + "epoch": 0.8398857895892818, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 2.1263110637664795, + "learning_rate": 1e-06, + "loss": 1.1249, + "mean_token_accuracy": 0.6693862080574036, + "num_tokens": 197970058.0, + "step": 7648 + }, + { + "epoch": 0.8399956072918955, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.1863698959350586, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6902229189872742, + "num_tokens": 198000588.0, + "step": 7649 + }, + { + "epoch": 0.8401054249945091, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.1424174308776855, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6940094232559204, + "num_tokens": 198030599.0, + "step": 7650 + }, + { + "epoch": 0.8402152426971228, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3210182189941406, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.707214891910553, + "num_tokens": 198055412.0, + "step": 7651 + }, + { + "epoch": 0.8403250603997364, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.3158156871795654, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.720244288444519, + "num_tokens": 198079265.0, + "step": 7652 + }, + { + "epoch": 0.8404348781023501, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 2.688894510269165, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7095825672149658, + "num_tokens": 
198099381.0, + "step": 7653 + }, + { + "epoch": 0.8405446958049637, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 2.428199291229248, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6991214752197266, + "num_tokens": 198122592.0, + "step": 7654 + }, + { + "epoch": 0.8406545135075775, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 2.5046017169952393, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.7070561647415161, + "num_tokens": 198146233.0, + "step": 7655 + }, + { + "epoch": 0.8407643312101911, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.4689950942993164, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7086106538772583, + "num_tokens": 198170128.0, + "step": 7656 + }, + { + "epoch": 0.8408741489128048, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 2.2901482582092285, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7243516445159912, + "num_tokens": 198195310.0, + "step": 7657 + }, + { + "epoch": 0.8409839666154184, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 2.286848306655884, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.699043869972229, + "num_tokens": 198221303.0, + "step": 7658 + }, + { + "epoch": 0.8410937843180321, + "ewc_loss": 1.4066696166992188e-05, + "grad_norm": 2.2852799892425537, + "learning_rate": 1e-06, + "loss": 1.1061, + "mean_token_accuracy": 0.6856319308280945, + "num_tokens": 198247562.0, + "step": 7659 + }, + { + "epoch": 0.8412036020206457, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.399569272994995, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6863609552383423, + "num_tokens": 198276906.0, + "step": 7660 + }, + { + "epoch": 0.8413134197232593, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 2.700533628463745, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.708362877368927, + "num_tokens": 198295104.0, + "step": 7661 + }, + { + 
"epoch": 0.8414232374258731, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2689502239227295, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6987143754959106, + "num_tokens": 198323331.0, + "step": 7662 + }, + { + "epoch": 0.8415330551284868, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.329828977584839, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.7017971277236938, + "num_tokens": 198350668.0, + "step": 7663 + }, + { + "epoch": 0.8416428728311004, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.0830459594726562, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7118743062019348, + "num_tokens": 198381292.0, + "step": 7664 + }, + { + "epoch": 0.841752690533714, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.378626585006714, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7085403203964233, + "num_tokens": 198406274.0, + "step": 7665 + }, + { + "epoch": 0.8418625082363277, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3578193187713623, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6941640377044678, + "num_tokens": 198430130.0, + "step": 7666 + }, + { + "epoch": 0.8419723259389413, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.072659969329834, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7233251333236694, + "num_tokens": 198459777.0, + "step": 7667 + }, + { + "epoch": 0.842082143641555, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.426725387573242, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6974735260009766, + "num_tokens": 198485192.0, + "step": 7668 + }, + { + "epoch": 0.8421919613441686, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.68710994720459, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.725908100605011, + "num_tokens": 198503231.0, + "step": 7669 + }, + { + "epoch": 0.8423017790467824, + "ewc_loss": 
1.424551010131836e-05, + "grad_norm": 2.4502646923065186, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7169554233551025, + "num_tokens": 198528863.0, + "step": 7670 + }, + { + "epoch": 0.842411596749396, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.352592945098877, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7138147354125977, + "num_tokens": 198553857.0, + "step": 7671 + }, + { + "epoch": 0.8425214144520097, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.329078435897827, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.696495532989502, + "num_tokens": 198583558.0, + "step": 7672 + }, + { + "epoch": 0.8426312321546233, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3865065574645996, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7252185344696045, + "num_tokens": 198609291.0, + "step": 7673 + }, + { + "epoch": 0.842741049857237, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3595545291900635, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7093657851219177, + "num_tokens": 198636900.0, + "step": 7674 + }, + { + "epoch": 0.8428508675598506, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.153689384460449, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6894059181213379, + "num_tokens": 198668433.0, + "step": 7675 + }, + { + "epoch": 0.8429606852624643, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3302924633026123, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7123250961303711, + "num_tokens": 198694562.0, + "step": 7676 + }, + { + "epoch": 0.843070502965078, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.4915292263031006, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.7028864622116089, + "num_tokens": 198717369.0, + "step": 7677 + }, + { + "epoch": 0.8431803206676917, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 
2.289964199066162, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7082172632217407, + "num_tokens": 198744856.0, + "step": 7678 + }, + { + "epoch": 0.8432901383703053, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.276698589324951, + "learning_rate": 1e-06, + "loss": 1.078, + "mean_token_accuracy": 0.6852858066558838, + "num_tokens": 198771538.0, + "step": 7679 + }, + { + "epoch": 0.843399956072919, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.486041307449341, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6877428293228149, + "num_tokens": 198796885.0, + "step": 7680 + }, + { + "epoch": 0.8435097737755326, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.5541999340057373, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7128769159317017, + "num_tokens": 198818940.0, + "step": 7681 + }, + { + "epoch": 0.8436195914781462, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.274710178375244, + "learning_rate": 1e-06, + "loss": 1.0921, + "mean_token_accuracy": 0.6726114153862, + "num_tokens": 198845612.0, + "step": 7682 + }, + { + "epoch": 0.8437294091807599, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.1680190563201904, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7068301439285278, + "num_tokens": 198874429.0, + "step": 7683 + }, + { + "epoch": 0.8438392268833736, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.5718741416931152, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7091454267501831, + "num_tokens": 198899257.0, + "step": 7684 + }, + { + "epoch": 0.8439490445859873, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.455963134765625, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7121220827102661, + "num_tokens": 198923303.0, + "step": 7685 + }, + { + "epoch": 0.8440588622886009, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.4406495094299316, + "learning_rate": 
1e-06, + "loss": 1.1556, + "mean_token_accuracy": 0.6594474911689758, + "num_tokens": 198953795.0, + "step": 7686 + }, + { + "epoch": 0.8441686799912146, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2675492763519287, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7000085115432739, + "num_tokens": 198981725.0, + "step": 7687 + }, + { + "epoch": 0.8442784976938282, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.382270336151123, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7094871401786804, + "num_tokens": 199005153.0, + "step": 7688 + }, + { + "epoch": 0.8443883153964419, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 2.5851497650146484, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7231117486953735, + "num_tokens": 199027042.0, + "step": 7689 + }, + { + "epoch": 0.8444981330990555, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.520930051803589, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.710052490234375, + "num_tokens": 199050146.0, + "step": 7690 + }, + { + "epoch": 0.8446079508016693, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.50272798538208, + "learning_rate": 1e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6949335336685181, + "num_tokens": 199073404.0, + "step": 7691 + }, + { + "epoch": 0.8447177685042829, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.261096715927124, + "learning_rate": 1e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.6970406174659729, + "num_tokens": 199101008.0, + "step": 7692 + }, + { + "epoch": 0.8448275862068966, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.390242338180542, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.6918957829475403, + "num_tokens": 199126165.0, + "step": 7693 + }, + { + "epoch": 0.8449374039095102, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3453989028930664, + "learning_rate": 1e-06, + "loss": 1.008, + 
"mean_token_accuracy": 0.7023965120315552, + "num_tokens": 199153533.0, + "step": 7694 + }, + { + "epoch": 0.8450472216121239, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2400970458984375, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7346261143684387, + "num_tokens": 199179424.0, + "step": 7695 + }, + { + "epoch": 0.8451570393147375, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.5734381675720215, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6981081962585449, + "num_tokens": 199202764.0, + "step": 7696 + }, + { + "epoch": 0.8452668570173512, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.4252421855926514, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7125158309936523, + "num_tokens": 199227940.0, + "step": 7697 + }, + { + "epoch": 0.8453766747199648, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3781301975250244, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.707582950592041, + "num_tokens": 199252255.0, + "step": 7698 + }, + { + "epoch": 0.8454864924225786, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2158002853393555, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7109366059303284, + "num_tokens": 199278817.0, + "step": 7699 + }, + { + "epoch": 0.8455963101251922, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.348066568374634, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7073872089385986, + "num_tokens": 199304937.0, + "step": 7700 + }, + { + "epoch": 0.8457061278278059, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.012739896774292, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7281002402305603, + "num_tokens": 199337317.0, + "step": 7701 + }, + { + "epoch": 0.8458159455304195, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3565056324005127, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7055298686027527, 
+ "num_tokens": 199363824.0, + "step": 7702 + }, + { + "epoch": 0.8459257632330331, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3489086627960205, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7028895616531372, + "num_tokens": 199388320.0, + "step": 7703 + }, + { + "epoch": 0.8460355809356468, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3938889503479004, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6959384679794312, + "num_tokens": 199415523.0, + "step": 7704 + }, + { + "epoch": 0.8461453986382604, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.469442367553711, + "learning_rate": 1e-06, + "loss": 1.1264, + "mean_token_accuracy": 0.6847599744796753, + "num_tokens": 199441645.0, + "step": 7705 + }, + { + "epoch": 0.8462552163408742, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.6141743659973145, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7171970009803772, + "num_tokens": 199462863.0, + "step": 7706 + }, + { + "epoch": 0.8463650340434878, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.546037197113037, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.726571261882782, + "num_tokens": 199484387.0, + "step": 7707 + }, + { + "epoch": 0.8464748517461015, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.256288528442383, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7003199458122253, + "num_tokens": 199513679.0, + "step": 7708 + }, + { + "epoch": 0.8465846694487151, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.403799533843994, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6859787702560425, + "num_tokens": 199542808.0, + "step": 7709 + }, + { + "epoch": 0.8466944871513288, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.5290675163269043, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.688255786895752, + "num_tokens": 199566968.0, + "step": 7710 + 
}, + { + "epoch": 0.8468043048539424, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2949585914611816, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7155606150627136, + "num_tokens": 199593692.0, + "step": 7711 + }, + { + "epoch": 0.8469141225565561, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.167052745819092, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7010154128074646, + "num_tokens": 199622402.0, + "step": 7712 + }, + { + "epoch": 0.8470239402591698, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.8047382831573486, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7157517671585083, + "num_tokens": 199641522.0, + "step": 7713 + }, + { + "epoch": 0.8471337579617835, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.267591714859009, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7066816091537476, + "num_tokens": 199667927.0, + "step": 7714 + }, + { + "epoch": 0.8472435756643971, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.496098041534424, + "learning_rate": 1e-06, + "loss": 1.0844, + "mean_token_accuracy": 0.6869425773620605, + "num_tokens": 199693552.0, + "step": 7715 + }, + { + "epoch": 0.8473533933670108, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.427898645401001, + "learning_rate": 1e-06, + "loss": 1.0983, + "mean_token_accuracy": 0.679653525352478, + "num_tokens": 199716886.0, + "step": 7716 + }, + { + "epoch": 0.8474632110696244, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.536254644393921, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7082535028457642, + "num_tokens": 199737928.0, + "step": 7717 + }, + { + "epoch": 0.8475730287722381, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.4895527362823486, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7079724073410034, + "num_tokens": 199766914.0, + "step": 7718 + }, + { + "epoch": 0.8476828464748517, + 
"ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.151870012283325, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7058345079421997, + "num_tokens": 199797223.0, + "step": 7719 + }, + { + "epoch": 0.8477926641774655, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.410783052444458, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7334302067756653, + "num_tokens": 199819060.0, + "step": 7720 + }, + { + "epoch": 0.8479024818800791, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3607442378997803, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.71170574426651, + "num_tokens": 199845241.0, + "step": 7721 + }, + { + "epoch": 0.8480122995826928, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3116776943206787, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7235800623893738, + "num_tokens": 199868686.0, + "step": 7722 + }, + { + "epoch": 0.8481221172853064, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.606961488723755, + "learning_rate": 1e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.6798638105392456, + "num_tokens": 199892312.0, + "step": 7723 + }, + { + "epoch": 0.84823193498792, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3916282653808594, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7072988152503967, + "num_tokens": 199917331.0, + "step": 7724 + }, + { + "epoch": 0.8483417526905337, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.328408718109131, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.706529974937439, + "num_tokens": 199942069.0, + "step": 7725 + }, + { + "epoch": 0.8484515703931473, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3033244609832764, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6906630992889404, + "num_tokens": 199967921.0, + "step": 7726 + }, + { + "epoch": 0.848561388095761, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 
2.245682716369629, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7172126770019531, + "num_tokens": 199993258.0, + "step": 7727 + }, + { + "epoch": 0.8486712057983747, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.1278207302093506, + "learning_rate": 1e-06, + "loss": 1.1041, + "mean_token_accuracy": 0.6731047630310059, + "num_tokens": 200025395.0, + "step": 7728 + }, + { + "epoch": 0.8487810235009884, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.691088914871216, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6911548376083374, + "num_tokens": 200047088.0, + "step": 7729 + }, + { + "epoch": 0.848890841203602, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2992498874664307, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7132153511047363, + "num_tokens": 200073044.0, + "step": 7730 + }, + { + "epoch": 0.8490006589062157, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.1811435222625732, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.7009433507919312, + "num_tokens": 200103693.0, + "step": 7731 + }, + { + "epoch": 0.8491104766088293, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2520487308502197, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7140713930130005, + "num_tokens": 200129605.0, + "step": 7732 + }, + { + "epoch": 0.849220294311443, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.232149839401245, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6979260444641113, + "num_tokens": 200159419.0, + "step": 7733 + }, + { + "epoch": 0.8493301120140566, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.4668970108032227, + "learning_rate": 1e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6824221014976501, + "num_tokens": 200183858.0, + "step": 7734 + }, + { + "epoch": 0.8494399297166704, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.1773521900177, + "learning_rate": 1e-06, + 
"loss": 1.0665, + "mean_token_accuracy": 0.6838948726654053, + "num_tokens": 200215080.0, + "step": 7735 + }, + { + "epoch": 0.849549747419284, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.5311689376831055, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6883409023284912, + "num_tokens": 200236944.0, + "step": 7736 + }, + { + "epoch": 0.8496595651218977, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2849695682525635, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6998029947280884, + "num_tokens": 200265185.0, + "step": 7737 + }, + { + "epoch": 0.8497693828245113, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.13454008102417, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.703947901725769, + "num_tokens": 200293716.0, + "step": 7738 + }, + { + "epoch": 0.849879200527125, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3541786670684814, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6992298364639282, + "num_tokens": 200318403.0, + "step": 7739 + }, + { + "epoch": 0.8499890182297386, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2452871799468994, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.730429470539093, + "num_tokens": 200343584.0, + "step": 7740 + }, + { + "epoch": 0.8500988359323522, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.064507007598877, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7030110359191895, + "num_tokens": 200375251.0, + "step": 7741 + }, + { + "epoch": 0.850208653634966, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.120408773422241, + "learning_rate": 1e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6901025772094727, + "num_tokens": 200406054.0, + "step": 7742 + }, + { + "epoch": 0.8503184713375797, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.6556596755981445, + "learning_rate": 1e-06, + "loss": 1.0712, + "mean_token_accuracy": 
0.7052263617515564, + "num_tokens": 200427875.0, + "step": 7743 + }, + { + "epoch": 0.8504282890401933, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2553813457489014, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7234687209129333, + "num_tokens": 200454987.0, + "step": 7744 + }, + { + "epoch": 0.8505381067428069, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3452794551849365, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7281574010848999, + "num_tokens": 200478848.0, + "step": 7745 + }, + { + "epoch": 0.8506479244454206, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.309109926223755, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7131468653678894, + "num_tokens": 200502938.0, + "step": 7746 + }, + { + "epoch": 0.8507577421480342, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.343888759613037, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7270398139953613, + "num_tokens": 200527962.0, + "step": 7747 + }, + { + "epoch": 0.8508675598506479, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.323326349258423, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.703083872795105, + "num_tokens": 200554384.0, + "step": 7748 + }, + { + "epoch": 0.8509773775532616, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3069818019866943, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6950744986534119, + "num_tokens": 200580256.0, + "step": 7749 + }, + { + "epoch": 0.8510871952558753, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.5411581993103027, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6993204355239868, + "num_tokens": 200602913.0, + "step": 7750 + }, + { + "epoch": 0.8511970129584889, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.1509034633636475, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6938518285751343, + "num_tokens": 
200637997.0, + "step": 7751 + }, + { + "epoch": 0.8513068306611026, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.4790961742401123, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7162420749664307, + "num_tokens": 200663680.0, + "step": 7752 + }, + { + "epoch": 0.8514166483637162, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2659130096435547, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7213578820228577, + "num_tokens": 200687698.0, + "step": 7753 + }, + { + "epoch": 0.8515264660663299, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.2821178436279297, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6908757090568542, + "num_tokens": 200718240.0, + "step": 7754 + }, + { + "epoch": 0.8516362837689435, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.4099411964416504, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6817464828491211, + "num_tokens": 200743003.0, + "step": 7755 + }, + { + "epoch": 0.8517461014715572, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.276463747024536, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6827849745750427, + "num_tokens": 200772187.0, + "step": 7756 + }, + { + "epoch": 0.8518559191741709, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.4491450786590576, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.720129668712616, + "num_tokens": 200800494.0, + "step": 7757 + }, + { + "epoch": 0.8519657368767846, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.613192081451416, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.718026340007782, + "num_tokens": 200820127.0, + "step": 7758 + }, + { + "epoch": 0.8520755545793982, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.8820741176605225, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7323633432388306, + "num_tokens": 200838496.0, + "step": 7759 + }, + { + 
"epoch": 0.8521853722820119, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.4969046115875244, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.7025878429412842, + "num_tokens": 200863720.0, + "step": 7760 + }, + { + "epoch": 0.8522951899846255, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.3794593811035156, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.7054583430290222, + "num_tokens": 200890496.0, + "step": 7761 + }, + { + "epoch": 0.8524050076872391, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.231816530227661, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7099571228027344, + "num_tokens": 200920864.0, + "step": 7762 + }, + { + "epoch": 0.8525148253898528, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.5012636184692383, + "learning_rate": 1e-06, + "loss": 1.0896, + "mean_token_accuracy": 0.6799225807189941, + "num_tokens": 200945141.0, + "step": 7763 + }, + { + "epoch": 0.8526246430924665, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.361046075820923, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7060588598251343, + "num_tokens": 200973178.0, + "step": 7764 + }, + { + "epoch": 0.8527344607950802, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.4285635948181152, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.71607905626297, + "num_tokens": 200996047.0, + "step": 7765 + }, + { + "epoch": 0.8528442784976938, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 2.405494451522827, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6986277103424072, + "num_tokens": 201021146.0, + "step": 7766 + }, + { + "epoch": 0.8529540962003075, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3415143489837646, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7188025712966919, + "num_tokens": 201046614.0, + "step": 7767 + }, + { + "epoch": 0.8530639139029211, + "ewc_loss": 
1.430511474609375e-05, + "grad_norm": 2.3841629028320312, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7311458587646484, + "num_tokens": 201068484.0, + "step": 7768 + }, + { + "epoch": 0.8531737316055348, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.4915220737457275, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7111670970916748, + "num_tokens": 201089170.0, + "step": 7769 + }, + { + "epoch": 0.8532835493081484, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5054147243499756, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6893333792686462, + "num_tokens": 201112542.0, + "step": 7770 + }, + { + "epoch": 0.8533933670107622, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.362370729446411, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7330064177513123, + "num_tokens": 201136180.0, + "step": 7771 + }, + { + "epoch": 0.8535031847133758, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.830457925796509, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7026337385177612, + "num_tokens": 201159136.0, + "step": 7772 + }, + { + "epoch": 0.8536130024159895, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.199573278427124, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6922529339790344, + "num_tokens": 201188466.0, + "step": 7773 + }, + { + "epoch": 0.8537228201186031, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.26611065864563, + "learning_rate": 1e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6874931454658508, + "num_tokens": 201214870.0, + "step": 7774 + }, + { + "epoch": 0.8538326378212168, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.4850645065307617, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7127733826637268, + "num_tokens": 201238516.0, + "step": 7775 + }, + { + "epoch": 0.8539424555238304, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 
2.348191499710083, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7064868807792664, + "num_tokens": 201264492.0, + "step": 7776 + }, + { + "epoch": 0.8540522732264441, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.002007007598877, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.684561550617218, + "num_tokens": 201300458.0, + "step": 7777 + }, + { + "epoch": 0.8541620909290578, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.9174084663391113, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.71149080991745, + "num_tokens": 201337032.0, + "step": 7778 + }, + { + "epoch": 0.8542719086316715, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5494301319122314, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7083109021186829, + "num_tokens": 201360124.0, + "step": 7779 + }, + { + "epoch": 0.8543817263342851, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.6463067531585693, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7033524513244629, + "num_tokens": 201379881.0, + "step": 7780 + }, + { + "epoch": 0.8544915440368988, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.408987283706665, + "learning_rate": 1e-06, + "loss": 1.1246, + "mean_token_accuracy": 0.668017566204071, + "num_tokens": 201407020.0, + "step": 7781 + }, + { + "epoch": 0.8546013617395124, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5418660640716553, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7138411402702332, + "num_tokens": 201430308.0, + "step": 7782 + }, + { + "epoch": 0.854711179442126, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3766605854034424, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6879177093505859, + "num_tokens": 201456423.0, + "step": 7783 + }, + { + "epoch": 0.8548209971447397, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.410010576248169, + "learning_rate": 1e-06, + 
"loss": 0.9616, + "mean_token_accuracy": 0.7144896388053894, + "num_tokens": 201479965.0, + "step": 7784 + }, + { + "epoch": 0.8549308148473534, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.666902780532837, + "learning_rate": 1e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.682848334312439, + "num_tokens": 201506398.0, + "step": 7785 + }, + { + "epoch": 0.8550406325499671, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2517218589782715, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7034072875976562, + "num_tokens": 201533902.0, + "step": 7786 + }, + { + "epoch": 0.8551504502525807, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.511556386947632, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7174338102340698, + "num_tokens": 201555508.0, + "step": 7787 + }, + { + "epoch": 0.8552602679551944, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3714652061462402, + "learning_rate": 1e-06, + "loss": 1.1211, + "mean_token_accuracy": 0.6732374429702759, + "num_tokens": 201583580.0, + "step": 7788 + }, + { + "epoch": 0.855370085657808, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2318272590637207, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.697529137134552, + "num_tokens": 201614497.0, + "step": 7789 + }, + { + "epoch": 0.8554799033604217, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3744001388549805, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7103004455566406, + "num_tokens": 201637724.0, + "step": 7790 + }, + { + "epoch": 0.8555897210630353, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2408368587493896, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7188809514045715, + "num_tokens": 201663407.0, + "step": 7791 + }, + { + "epoch": 0.855699538765649, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.7232885360717773, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 
0.6984267234802246, + "num_tokens": 201683632.0, + "step": 7792 + }, + { + "epoch": 0.8558093564682627, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.503279209136963, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7157063484191895, + "num_tokens": 201704739.0, + "step": 7793 + }, + { + "epoch": 0.8559191741708764, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5037879943847656, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7070416212081909, + "num_tokens": 201727432.0, + "step": 7794 + }, + { + "epoch": 0.85602899187349, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.545119047164917, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7009627819061279, + "num_tokens": 201751819.0, + "step": 7795 + }, + { + "epoch": 0.8561388095761037, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.0068018436431885, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7104856967926025, + "num_tokens": 201783478.0, + "step": 7796 + }, + { + "epoch": 0.8562486272787173, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2836174964904785, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7067689895629883, + "num_tokens": 201808626.0, + "step": 7797 + }, + { + "epoch": 0.856358444981331, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5660600662231445, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7127607464790344, + "num_tokens": 201830947.0, + "step": 7798 + }, + { + "epoch": 0.8564682626839446, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.4731733798980713, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6928818225860596, + "num_tokens": 201855299.0, + "step": 7799 + }, + { + "epoch": 0.8565780803865584, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.440929889678955, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7122358083724976, + "num_tokens": 
201881901.0, + "step": 7800 + }, + { + "epoch": 0.856687898089172, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.126478433609009, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.697684645652771, + "num_tokens": 201914176.0, + "step": 7801 + }, + { + "epoch": 0.8567977157917857, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.1824300289154053, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6988331079483032, + "num_tokens": 201941746.0, + "step": 7802 + }, + { + "epoch": 0.8569075334943993, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.229025363922119, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7193232774734497, + "num_tokens": 201968023.0, + "step": 7803 + }, + { + "epoch": 0.8570173511970129, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.31693172454834, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.709196925163269, + "num_tokens": 201992326.0, + "step": 7804 + }, + { + "epoch": 0.8571271688996266, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.899659514427185, + "learning_rate": 1e-06, + "loss": 1.1227, + "mean_token_accuracy": 0.6672379970550537, + "num_tokens": 202030782.0, + "step": 7805 + }, + { + "epoch": 0.8572369866022402, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.252528429031372, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7026695609092712, + "num_tokens": 202058757.0, + "step": 7806 + }, + { + "epoch": 0.857346804304854, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.436781883239746, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7003252506256104, + "num_tokens": 202083103.0, + "step": 7807 + }, + { + "epoch": 0.8574566220074676, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3083879947662354, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.692011296749115, + "num_tokens": 202112823.0, + "step": 7808 + }, + { + "epoch": 
0.8575664397100813, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.0429582595825195, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7230556011199951, + "num_tokens": 202143862.0, + "step": 7809 + }, + { + "epoch": 0.8576762574126949, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.0489065647125244, + "learning_rate": 1e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.6820529103279114, + "num_tokens": 202176427.0, + "step": 7810 + }, + { + "epoch": 0.8577860751153086, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.215446710586548, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7203297019004822, + "num_tokens": 202205462.0, + "step": 7811 + }, + { + "epoch": 0.8578958928179222, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5070676803588867, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.708064079284668, + "num_tokens": 202228124.0, + "step": 7812 + }, + { + "epoch": 0.8580057105205359, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.4378409385681152, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7089312076568604, + "num_tokens": 202253879.0, + "step": 7813 + }, + { + "epoch": 0.8581155282231496, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2218475341796875, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6969931125640869, + "num_tokens": 202283863.0, + "step": 7814 + }, + { + "epoch": 0.8582253459257633, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.402663230895996, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7081103324890137, + "num_tokens": 202308816.0, + "step": 7815 + }, + { + "epoch": 0.8583351636283769, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.7088027000427246, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7217745184898376, + "num_tokens": 202328376.0, + "step": 7816 + }, + { + "epoch": 0.8584449813309906, + "ewc_loss": 
1.430511474609375e-05, + "grad_norm": 2.2764267921447754, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.707693874835968, + "num_tokens": 202355544.0, + "step": 7817 + }, + { + "epoch": 0.8585547990336042, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.173402786254883, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7262858152389526, + "num_tokens": 202381441.0, + "step": 7818 + }, + { + "epoch": 0.8586646167362179, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.307223320007324, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.728916585445404, + "num_tokens": 202408014.0, + "step": 7819 + }, + { + "epoch": 0.8587744344388315, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.21514630317688, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7420065402984619, + "num_tokens": 202435919.0, + "step": 7820 + }, + { + "epoch": 0.8588842521414451, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.351612091064453, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7175430059432983, + "num_tokens": 202458627.0, + "step": 7821 + }, + { + "epoch": 0.8589940698440589, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.30312442779541, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7065539360046387, + "num_tokens": 202484299.0, + "step": 7822 + }, + { + "epoch": 0.8591038875466726, + "ewc_loss": 1.436471939086914e-05, + "grad_norm": 2.278214693069458, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7055097222328186, + "num_tokens": 202510414.0, + "step": 7823 + }, + { + "epoch": 0.8592137052492862, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3928048610687256, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7125133275985718, + "num_tokens": 202533513.0, + "step": 7824 + }, + { + "epoch": 0.8593235229518998, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 
2.398893117904663, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6962937712669373, + "num_tokens": 202559780.0, + "step": 7825 + }, + { + "epoch": 0.8594333406545135, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.119147300720215, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6995108127593994, + "num_tokens": 202589175.0, + "step": 7826 + }, + { + "epoch": 0.8595431583571271, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5944983959198, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7149826884269714, + "num_tokens": 202611568.0, + "step": 7827 + }, + { + "epoch": 0.8596529760597408, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.1723923683166504, + "learning_rate": 1e-06, + "loss": 1.0888, + "mean_token_accuracy": 0.6752398014068604, + "num_tokens": 202640644.0, + "step": 7828 + }, + { + "epoch": 0.8597627937623545, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.943591594696045, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6996967792510986, + "num_tokens": 202674639.0, + "step": 7829 + }, + { + "epoch": 0.8598726114649682, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.310668706893921, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6894956231117249, + "num_tokens": 202700287.0, + "step": 7830 + }, + { + "epoch": 0.8599824291675818, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.9634288549423218, + "learning_rate": 1e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.6790788769721985, + "num_tokens": 202736629.0, + "step": 7831 + }, + { + "epoch": 0.8600922468701955, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.526155471801758, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7148883938789368, + "num_tokens": 202757926.0, + "step": 7832 + }, + { + "epoch": 0.8602020645728091, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3124923706054688, + "learning_rate": 1e-06, + 
"loss": 1.0235, + "mean_token_accuracy": 0.6956073045730591, + "num_tokens": 202784249.0, + "step": 7833 + }, + { + "epoch": 0.8603118822754228, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3749749660491943, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6962878704071045, + "num_tokens": 202809701.0, + "step": 7834 + }, + { + "epoch": 0.8604216999780364, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2657408714294434, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6986218690872192, + "num_tokens": 202837782.0, + "step": 7835 + }, + { + "epoch": 0.8605315176806502, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3415753841400146, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7229343056678772, + "num_tokens": 202864615.0, + "step": 7836 + }, + { + "epoch": 0.8606413353832638, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.242476463317871, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.705869197845459, + "num_tokens": 202891672.0, + "step": 7837 + }, + { + "epoch": 0.8607511530858775, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2667863368988037, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6957291960716248, + "num_tokens": 202922336.0, + "step": 7838 + }, + { + "epoch": 0.8608609707884911, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2666773796081543, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6947273015975952, + "num_tokens": 202949143.0, + "step": 7839 + }, + { + "epoch": 0.8609707884911048, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5291664600372314, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7377790212631226, + "num_tokens": 202967791.0, + "step": 7840 + }, + { + "epoch": 0.8610806061937184, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.423002243041992, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 
0.690540611743927, + "num_tokens": 202990516.0, + "step": 7841 + }, + { + "epoch": 0.861190423896332, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5447826385498047, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6981067657470703, + "num_tokens": 203014639.0, + "step": 7842 + }, + { + "epoch": 0.8613002415989458, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.307415246963501, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7096896171569824, + "num_tokens": 203042275.0, + "step": 7843 + }, + { + "epoch": 0.8614100593015594, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5851728916168213, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6951427459716797, + "num_tokens": 203066848.0, + "step": 7844 + }, + { + "epoch": 0.8615198770041731, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.1506850719451904, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6819157600402832, + "num_tokens": 203097751.0, + "step": 7845 + }, + { + "epoch": 0.8616296947067867, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.290609836578369, + "learning_rate": 1e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6876397132873535, + "num_tokens": 203124755.0, + "step": 7846 + }, + { + "epoch": 0.8617395124094004, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.60046124458313, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.7026564478874207, + "num_tokens": 203146497.0, + "step": 7847 + }, + { + "epoch": 0.861849330112014, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5605154037475586, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7107234001159668, + "num_tokens": 203167468.0, + "step": 7848 + }, + { + "epoch": 0.8619591478146277, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.1308178901672363, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7168232798576355, + "num_tokens": 
203198814.0, + "step": 7849 + }, + { + "epoch": 0.8620689655172413, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.357395887374878, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.702678382396698, + "num_tokens": 203224152.0, + "step": 7850 + }, + { + "epoch": 0.8621787832198551, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2967169284820557, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7301715612411499, + "num_tokens": 203248679.0, + "step": 7851 + }, + { + "epoch": 0.8622886009224687, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.1094038486480713, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6843667030334473, + "num_tokens": 203279320.0, + "step": 7852 + }, + { + "epoch": 0.8623984186250824, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3553545475006104, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7138058543205261, + "num_tokens": 203303420.0, + "step": 7853 + }, + { + "epoch": 0.862508236327696, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.586636543273926, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7103780508041382, + "num_tokens": 203327137.0, + "step": 7854 + }, + { + "epoch": 0.8626180540303097, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3533217906951904, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7134901881217957, + "num_tokens": 203353839.0, + "step": 7855 + }, + { + "epoch": 0.8627278717329233, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2119147777557373, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6870774030685425, + "num_tokens": 203383795.0, + "step": 7856 + }, + { + "epoch": 0.862837689435537, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5539016723632812, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7123268842697144, + "num_tokens": 203407162.0, + "step": 7857 + }, + { + 
"epoch": 0.8629475071381507, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3451781272888184, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.698844313621521, + "num_tokens": 203434099.0, + "step": 7858 + }, + { + "epoch": 0.8630573248407644, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.4908318519592285, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7155566215515137, + "num_tokens": 203456548.0, + "step": 7859 + }, + { + "epoch": 0.863167142543378, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.1950299739837646, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7151139378547668, + "num_tokens": 203484169.0, + "step": 7860 + }, + { + "epoch": 0.8632769602459917, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3607397079467773, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6924775838851929, + "num_tokens": 203509125.0, + "step": 7861 + }, + { + "epoch": 0.8633867779486053, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.4184067249298096, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7138171792030334, + "num_tokens": 203534755.0, + "step": 7862 + }, + { + "epoch": 0.863496595651219, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.408674716949463, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6922292709350586, + "num_tokens": 203558568.0, + "step": 7863 + }, + { + "epoch": 0.8636064133538326, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.373471975326538, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.706526517868042, + "num_tokens": 203582837.0, + "step": 7864 + }, + { + "epoch": 0.8637162310564463, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2570993900299072, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7138280868530273, + "num_tokens": 203611467.0, + "step": 7865 + }, + { + "epoch": 0.86382604875906, + "ewc_loss": 
1.430511474609375e-05, + "grad_norm": 2.4174747467041016, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7086048126220703, + "num_tokens": 203636023.0, + "step": 7866 + }, + { + "epoch": 0.8639358664616736, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3753390312194824, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6996316313743591, + "num_tokens": 203660568.0, + "step": 7867 + }, + { + "epoch": 0.8640456841642873, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.317863941192627, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6899280548095703, + "num_tokens": 203688505.0, + "step": 7868 + }, + { + "epoch": 0.8641555018669009, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2290079593658447, + "learning_rate": 1e-06, + "loss": 1.108, + "mean_token_accuracy": 0.679611086845398, + "num_tokens": 203718239.0, + "step": 7869 + }, + { + "epoch": 0.8642653195695146, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.312926769256592, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7131214141845703, + "num_tokens": 203743844.0, + "step": 7870 + }, + { + "epoch": 0.8643751372721282, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.4137346744537354, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7078450918197632, + "num_tokens": 203767153.0, + "step": 7871 + }, + { + "epoch": 0.864484954974742, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.3028342723846436, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7031515836715698, + "num_tokens": 203794917.0, + "step": 7872 + }, + { + "epoch": 0.8645947726773556, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5754024982452393, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7029576301574707, + "num_tokens": 203817801.0, + "step": 7873 + }, + { + "epoch": 0.8647045903799693, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 
2.4850878715515137, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6973730325698853, + "num_tokens": 203841656.0, + "step": 7874 + }, + { + "epoch": 0.8648144080825829, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.5350680351257324, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7165536284446716, + "num_tokens": 203864128.0, + "step": 7875 + }, + { + "epoch": 0.8649242257851966, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.4396843910217285, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7219257354736328, + "num_tokens": 203886915.0, + "step": 7876 + }, + { + "epoch": 0.8650340434878102, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.2515978813171387, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7132568955421448, + "num_tokens": 203912550.0, + "step": 7877 + }, + { + "epoch": 0.8651438611904239, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.523244619369507, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7065712213516235, + "num_tokens": 203935089.0, + "step": 7878 + }, + { + "epoch": 0.8652536788930375, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.258103609085083, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7334232330322266, + "num_tokens": 203963748.0, + "step": 7879 + }, + { + "epoch": 0.8653634965956513, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.56877064704895, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7194393873214722, + "num_tokens": 203984878.0, + "step": 7880 + }, + { + "epoch": 0.8654733142982649, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.160693883895874, + "learning_rate": 1e-06, + "loss": 1.1214, + "mean_token_accuracy": 0.6784888505935669, + "num_tokens": 204014695.0, + "step": 7881 + }, + { + "epoch": 0.8655831320008786, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5969817638397217, + "learning_rate": 
1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7174988985061646, + "num_tokens": 204035281.0, + "step": 7882 + }, + { + "epoch": 0.8656929497034922, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.461150884628296, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7136571407318115, + "num_tokens": 204059679.0, + "step": 7883 + }, + { + "epoch": 0.8658027674061058, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5778541564941406, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7124278545379639, + "num_tokens": 204081454.0, + "step": 7884 + }, + { + "epoch": 0.8659125851087195, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5516343116760254, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7212215065956116, + "num_tokens": 204103325.0, + "step": 7885 + }, + { + "epoch": 0.8660224028113331, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.7312803268432617, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7033950090408325, + "num_tokens": 204124511.0, + "step": 7886 + }, + { + "epoch": 0.8661322205139469, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.337153434753418, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7042754888534546, + "num_tokens": 204151632.0, + "step": 7887 + }, + { + "epoch": 0.8662420382165605, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.325157880783081, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6842471361160278, + "num_tokens": 204177986.0, + "step": 7888 + }, + { + "epoch": 0.8663518559191742, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.345555543899536, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6960283517837524, + "num_tokens": 204205111.0, + "step": 7889 + }, + { + "epoch": 0.8664616736217878, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.432406425476074, + "learning_rate": 1e-06, + "loss": 0.9944, + 
"mean_token_accuracy": 0.7013329863548279, + "num_tokens": 204227601.0, + "step": 7890 + }, + { + "epoch": 0.8665714913244015, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.4645912647247314, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7210943698883057, + "num_tokens": 204250245.0, + "step": 7891 + }, + { + "epoch": 0.8666813090270151, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5979390144348145, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7068437337875366, + "num_tokens": 204274328.0, + "step": 7892 + }, + { + "epoch": 0.8667911267296288, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5704901218414307, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7185852527618408, + "num_tokens": 204297123.0, + "step": 7893 + }, + { + "epoch": 0.8669009444322425, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.132265090942383, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7074185609817505, + "num_tokens": 204327408.0, + "step": 7894 + }, + { + "epoch": 0.8670107621348562, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.686877489089966, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7165665030479431, + "num_tokens": 204349246.0, + "step": 7895 + }, + { + "epoch": 0.8671205798374698, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.7564098834991455, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7268860340118408, + "num_tokens": 204367333.0, + "step": 7896 + }, + { + "epoch": 0.8672303975400835, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.02760648727417, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7249634265899658, + "num_tokens": 204400348.0, + "step": 7897 + }, + { + "epoch": 0.8673402152426971, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.4173173904418945, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 
0.6850287914276123, + "num_tokens": 204425388.0, + "step": 7898 + }, + { + "epoch": 0.8674500329453108, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.657040596008301, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6920574307441711, + "num_tokens": 204449858.0, + "step": 7899 + }, + { + "epoch": 0.8675598506479244, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.3742642402648926, + "learning_rate": 1e-06, + "loss": 1.0672, + "mean_token_accuracy": 0.684302031993866, + "num_tokens": 204476824.0, + "step": 7900 + }, + { + "epoch": 0.8676696683505382, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.1541426181793213, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.682902455329895, + "num_tokens": 204505730.0, + "step": 7901 + }, + { + "epoch": 0.8677794860531518, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.7844908237457275, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7028605937957764, + "num_tokens": 204526003.0, + "step": 7902 + }, + { + "epoch": 0.8678893037557655, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.608799457550049, + "learning_rate": 1e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.6955821514129639, + "num_tokens": 204548218.0, + "step": 7903 + }, + { + "epoch": 0.8679991214583791, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.263821840286255, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6928845643997192, + "num_tokens": 204576206.0, + "step": 7904 + }, + { + "epoch": 0.8681089391609927, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.101961374282837, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6959414482116699, + "num_tokens": 204608378.0, + "step": 7905 + }, + { + "epoch": 0.8682187568636064, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.6830484867095947, + "learning_rate": 1e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.6854152083396912, + "num_tokens": 
204628158.0, + "step": 7906 + }, + { + "epoch": 0.86832857456622, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5697247982025146, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.713086724281311, + "num_tokens": 204650319.0, + "step": 7907 + }, + { + "epoch": 0.8684383922688337, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.874652862548828, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7173470854759216, + "num_tokens": 204668863.0, + "step": 7908 + }, + { + "epoch": 0.8685482099714474, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.3818225860595703, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.717991054058075, + "num_tokens": 204692670.0, + "step": 7909 + }, + { + "epoch": 0.8686580276740611, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5382325649261475, + "learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.6862828731536865, + "num_tokens": 204715673.0, + "step": 7910 + }, + { + "epoch": 0.8687678453766747, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5443129539489746, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7313115000724792, + "num_tokens": 204738523.0, + "step": 7911 + }, + { + "epoch": 0.8688776630792884, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.2390964031219482, + "learning_rate": 1e-06, + "loss": 1.1123, + "mean_token_accuracy": 0.6710219979286194, + "num_tokens": 204767872.0, + "step": 7912 + }, + { + "epoch": 0.868987480781902, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.288045883178711, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.693533182144165, + "num_tokens": 204795170.0, + "step": 7913 + }, + { + "epoch": 0.8690972984845157, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.194424867630005, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6832773685455322, + "num_tokens": 204825372.0, + "step": 7914 + }, + { + 
"epoch": 0.8692071161871293, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.255500078201294, + "learning_rate": 1e-06, + "loss": 1.0924, + "mean_token_accuracy": 0.6972293853759766, + "num_tokens": 204853010.0, + "step": 7915 + }, + { + "epoch": 0.8693169338897431, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.1661782264709473, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7007782459259033, + "num_tokens": 204883722.0, + "step": 7916 + }, + { + "epoch": 0.8694267515923567, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.3711917400360107, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7067986130714417, + "num_tokens": 204908704.0, + "step": 7917 + }, + { + "epoch": 0.8695365692949704, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.457683801651001, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7114931344985962, + "num_tokens": 204931118.0, + "step": 7918 + }, + { + "epoch": 0.869646386997584, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.1889922618865967, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6767253279685974, + "num_tokens": 204959245.0, + "step": 7919 + }, + { + "epoch": 0.8697562047001977, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.634716510772705, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7020177841186523, + "num_tokens": 204980960.0, + "step": 7920 + }, + { + "epoch": 0.8698660224028113, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.1258628368377686, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.709605872631073, + "num_tokens": 205012385.0, + "step": 7921 + }, + { + "epoch": 0.869975840105425, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.1651196479797363, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7043344974517822, + "num_tokens": 205042316.0, + "step": 7922 + }, + { + "epoch": 0.8700856578080387, + 
"ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.2913687229156494, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7007088661193848, + "num_tokens": 205069593.0, + "step": 7923 + }, + { + "epoch": 0.8701954755106523, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.245604991912842, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7090953588485718, + "num_tokens": 205095869.0, + "step": 7924 + }, + { + "epoch": 0.870305293213266, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.4776418209075928, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6871190071105957, + "num_tokens": 205120650.0, + "step": 7925 + }, + { + "epoch": 0.8704151109158796, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.525352716445923, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6953809857368469, + "num_tokens": 205145191.0, + "step": 7926 + }, + { + "epoch": 0.8705249286184933, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.4665000438690186, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7132886648178101, + "num_tokens": 205171048.0, + "step": 7927 + }, + { + "epoch": 0.8706347463211069, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.21154522895813, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7133436799049377, + "num_tokens": 205197124.0, + "step": 7928 + }, + { + "epoch": 0.8707445640237206, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.0816233158111572, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7049969434738159, + "num_tokens": 205228334.0, + "step": 7929 + }, + { + "epoch": 0.8708543817263343, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.343778371810913, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7187364101409912, + "num_tokens": 205252781.0, + "step": 7930 + }, + { + "epoch": 0.870964199428948, + "ewc_loss": 1.4424324035644531e-05, + 
"grad_norm": 2.36210298538208, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.698395848274231, + "num_tokens": 205279568.0, + "step": 7931 + }, + { + "epoch": 0.8710740171315616, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.7577123641967773, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.717189371585846, + "num_tokens": 205298739.0, + "step": 7932 + }, + { + "epoch": 0.8711838348341753, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.452031135559082, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.724353015422821, + "num_tokens": 205323281.0, + "step": 7933 + }, + { + "epoch": 0.8712936525367889, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5440006256103516, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.702203631401062, + "num_tokens": 205346345.0, + "step": 7934 + }, + { + "epoch": 0.8714034702394026, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.5544328689575195, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7111546993255615, + "num_tokens": 205367389.0, + "step": 7935 + }, + { + "epoch": 0.8715132879420162, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.445291519165039, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6978306770324707, + "num_tokens": 205390695.0, + "step": 7936 + }, + { + "epoch": 0.87162310564463, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.411309242248535, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6908811330795288, + "num_tokens": 205415769.0, + "step": 7937 + }, + { + "epoch": 0.8717329233472436, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.420226812362671, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7169286012649536, + "num_tokens": 205438970.0, + "step": 7938 + }, + { + "epoch": 0.8718427410498573, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.3585095405578613, + 
"learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7027488946914673, + "num_tokens": 205463899.0, + "step": 7939 + }, + { + "epoch": 0.8719525587524709, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 2.2405121326446533, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6974912881851196, + "num_tokens": 205493986.0, + "step": 7940 + }, + { + "epoch": 0.8720623764550846, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.6179537773132324, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7025023698806763, + "num_tokens": 205516520.0, + "step": 7941 + }, + { + "epoch": 0.8721721941576982, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.3138158321380615, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6896988749504089, + "num_tokens": 205544956.0, + "step": 7942 + }, + { + "epoch": 0.8722820118603118, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.649519443511963, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7026776671409607, + "num_tokens": 205565308.0, + "step": 7943 + }, + { + "epoch": 0.8723918295629255, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.2269203662872314, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.697435736656189, + "num_tokens": 205592370.0, + "step": 7944 + }, + { + "epoch": 0.8725016472655392, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.2083699703216553, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6943597793579102, + "num_tokens": 205622412.0, + "step": 7945 + }, + { + "epoch": 0.8726114649681529, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.448019504547119, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.703406572341919, + "num_tokens": 205646197.0, + "step": 7946 + }, + { + "epoch": 0.8727212826707665, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.430989980697632, + "learning_rate": 1e-06, + "loss": 
0.9727, + "mean_token_accuracy": 0.7075493335723877, + "num_tokens": 205669733.0, + "step": 7947 + }, + { + "epoch": 0.8728311003733802, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.210714817047119, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7024986147880554, + "num_tokens": 205697984.0, + "step": 7948 + }, + { + "epoch": 0.8729409180759938, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.6070261001586914, + "learning_rate": 1e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6865752935409546, + "num_tokens": 205722254.0, + "step": 7949 + }, + { + "epoch": 0.8730507357786075, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.5162200927734375, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7021638751029968, + "num_tokens": 205748538.0, + "step": 7950 + }, + { + "epoch": 0.8731605534812211, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.437368154525757, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7325843572616577, + "num_tokens": 205772601.0, + "step": 7951 + }, + { + "epoch": 0.8732703711838349, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.4336907863616943, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6919924020767212, + "num_tokens": 205796423.0, + "step": 7952 + }, + { + "epoch": 0.8733801888864485, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.223160982131958, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7243137955665588, + "num_tokens": 205826257.0, + "step": 7953 + }, + { + "epoch": 0.8734900065890622, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.3771321773529053, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6935181617736816, + "num_tokens": 205850383.0, + "step": 7954 + }, + { + "epoch": 0.8735998242916758, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.7231457233428955, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 
0.7021857500076294, + "num_tokens": 205869212.0, + "step": 7955 + }, + { + "epoch": 0.8737096419942895, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.55466890335083, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7100090384483337, + "num_tokens": 205891962.0, + "step": 7956 + }, + { + "epoch": 0.8738194596969031, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.5278828144073486, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7119837403297424, + "num_tokens": 205916887.0, + "step": 7957 + }, + { + "epoch": 0.8739292773995168, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.197005271911621, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7076504230499268, + "num_tokens": 205943337.0, + "step": 7958 + }, + { + "epoch": 0.8740390951021305, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.2341530323028564, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6977987289428711, + "num_tokens": 205969059.0, + "step": 7959 + }, + { + "epoch": 0.8741489128047442, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.1477198600769043, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7189454436302185, + "num_tokens": 205998041.0, + "step": 7960 + }, + { + "epoch": 0.8742587305073578, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.3076446056365967, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7129804491996765, + "num_tokens": 206022857.0, + "step": 7961 + }, + { + "epoch": 0.8743685482099715, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.208787441253662, + "learning_rate": 1e-06, + "loss": 1.1122, + "mean_token_accuracy": 0.6825301647186279, + "num_tokens": 206051159.0, + "step": 7962 + }, + { + "epoch": 0.8744783659125851, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.216614246368408, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7151548266410828, + "num_tokens": 
206079039.0, + "step": 7963 + }, + { + "epoch": 0.8745881836151987, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.381035804748535, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7014278173446655, + "num_tokens": 206103595.0, + "step": 7964 + }, + { + "epoch": 0.8746980013178124, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.363341808319092, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6968271732330322, + "num_tokens": 206128831.0, + "step": 7965 + }, + { + "epoch": 0.8748078190204261, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.3924996852874756, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6979421377182007, + "num_tokens": 206153310.0, + "step": 7966 + }, + { + "epoch": 0.8749176367230398, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.1763224601745605, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7233238816261292, + "num_tokens": 206179791.0, + "step": 7967 + }, + { + "epoch": 0.8750274544256534, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.730937957763672, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7133503556251526, + "num_tokens": 206198392.0, + "step": 7968 + }, + { + "epoch": 0.8751372721282671, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.30655837059021, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.7047050595283508, + "num_tokens": 206226572.0, + "step": 7969 + }, + { + "epoch": 0.8752470898308807, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.520197629928589, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7333879470825195, + "num_tokens": 206247127.0, + "step": 7970 + }, + { + "epoch": 0.8753569075334944, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.4226715564727783, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7114701271057129, + "num_tokens": 206271724.0, + "step": 7971 + }, + { 
+ "epoch": 0.875466725236108, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.0642499923706055, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6969912052154541, + "num_tokens": 206302578.0, + "step": 7972 + }, + { + "epoch": 0.8755765429387217, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.403216600418091, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7115541100502014, + "num_tokens": 206328020.0, + "step": 7973 + }, + { + "epoch": 0.8756863606413354, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.237675905227661, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7172847390174866, + "num_tokens": 206353497.0, + "step": 7974 + }, + { + "epoch": 0.8757961783439491, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.268686294555664, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7048859000205994, + "num_tokens": 206377663.0, + "step": 7975 + }, + { + "epoch": 0.8759059960465627, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.258734703063965, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.702688992023468, + "num_tokens": 206404111.0, + "step": 7976 + }, + { + "epoch": 0.8760158137491764, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.5296850204467773, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7037353515625, + "num_tokens": 206425714.0, + "step": 7977 + }, + { + "epoch": 0.87612563145179, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.2135677337646484, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6968506574630737, + "num_tokens": 206454801.0, + "step": 7978 + }, + { + "epoch": 0.8762354491544037, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.370537281036377, + "learning_rate": 1e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.6848342418670654, + "num_tokens": 206480724.0, + "step": 7979 + }, + { + "epoch": 0.8763452668570173, + "ewc_loss": 
1.4483928680419922e-05, + "grad_norm": 2.2565758228302, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7131968140602112, + "num_tokens": 206508803.0, + "step": 7980 + }, + { + "epoch": 0.8764550845596311, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.3910555839538574, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7010402083396912, + "num_tokens": 206532791.0, + "step": 7981 + }, + { + "epoch": 0.8765649022622447, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.8814620971679688, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7042428851127625, + "num_tokens": 206550452.0, + "step": 7982 + }, + { + "epoch": 0.8766747199648584, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.2967875003814697, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7006279230117798, + "num_tokens": 206575311.0, + "step": 7983 + }, + { + "epoch": 0.876784537667472, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.1096103191375732, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6910645961761475, + "num_tokens": 206606387.0, + "step": 7984 + }, + { + "epoch": 0.8768943553700856, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 2.250196933746338, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7197096347808838, + "num_tokens": 206631078.0, + "step": 7985 + }, + { + "epoch": 0.8770041730726993, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4247193336486816, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.7008109092712402, + "num_tokens": 206656326.0, + "step": 7986 + }, + { + "epoch": 0.8771139907753129, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.107046127319336, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.698070228099823, + "num_tokens": 206687052.0, + "step": 7987 + }, + { + "epoch": 0.8772238084779267, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 
2.646503210067749, + "learning_rate": 1e-06, + "loss": 1.1072, + "mean_token_accuracy": 0.6726222038269043, + "num_tokens": 206708479.0, + "step": 7988 + }, + { + "epoch": 0.8773336261805403, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.26639461517334, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6900141835212708, + "num_tokens": 206734578.0, + "step": 7989 + }, + { + "epoch": 0.877443443883154, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.23276686668396, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7194474935531616, + "num_tokens": 206761012.0, + "step": 7990 + }, + { + "epoch": 0.8775532615857676, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.415527820587158, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7239074110984802, + "num_tokens": 206782739.0, + "step": 7991 + }, + { + "epoch": 0.8776630792883813, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.103602409362793, + "learning_rate": 1e-06, + "loss": 1.0945, + "mean_token_accuracy": 0.6761698126792908, + "num_tokens": 206816363.0, + "step": 7992 + }, + { + "epoch": 0.8777728969909949, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4458987712860107, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7174941897392273, + "num_tokens": 206840364.0, + "step": 7993 + }, + { + "epoch": 0.8778827146936086, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4076313972473145, + "learning_rate": 1e-06, + "loss": 1.0943, + "mean_token_accuracy": 0.6868130564689636, + "num_tokens": 206868088.0, + "step": 7994 + }, + { + "epoch": 0.8779925323962223, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.5790553092956543, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7019652128219604, + "num_tokens": 206889482.0, + "step": 7995 + }, + { + "epoch": 0.878102350098836, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.592410087585449, + "learning_rate": 
1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7044318914413452, + "num_tokens": 206912349.0, + "step": 7996 + }, + { + "epoch": 0.8782121678014496, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3130412101745605, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7173670530319214, + "num_tokens": 206937852.0, + "step": 7997 + }, + { + "epoch": 0.8783219855040633, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4449081420898438, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7269061803817749, + "num_tokens": 206961008.0, + "step": 7998 + }, + { + "epoch": 0.8784318032066769, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.474459171295166, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6958639621734619, + "num_tokens": 206983322.0, + "step": 7999 + }, + { + "epoch": 0.8785416209092906, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4274802207946777, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6996747851371765, + "num_tokens": 207007351.0, + "step": 8000 + }, + { + "epoch": 0.8786514386119042, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.1502909660339355, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7074665427207947, + "num_tokens": 207038395.0, + "step": 8001 + }, + { + "epoch": 0.8787612563145178, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4653892517089844, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7062591314315796, + "num_tokens": 207062632.0, + "step": 8002 + }, + { + "epoch": 0.8788710740171316, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.1907832622528076, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.6844853758811951, + "num_tokens": 207092243.0, + "step": 8003 + }, + { + "epoch": 0.8789808917197452, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.12060809135437, + "learning_rate": 1e-06, + "loss": 1.045, + 
"mean_token_accuracy": 0.6984731554985046, + "num_tokens": 207122819.0, + "step": 8004 + }, + { + "epoch": 0.8790907094223589, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.43998384475708, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7113969326019287, + "num_tokens": 207147033.0, + "step": 8005 + }, + { + "epoch": 0.8792005271249725, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.354898691177368, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6867415904998779, + "num_tokens": 207172566.0, + "step": 8006 + }, + { + "epoch": 0.8793103448275862, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.1313366889953613, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.699975848197937, + "num_tokens": 207201224.0, + "step": 8007 + }, + { + "epoch": 0.8794201625301998, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2625603675842285, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7190848588943481, + "num_tokens": 207228077.0, + "step": 8008 + }, + { + "epoch": 0.8795299802328135, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.249265432357788, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6795947551727295, + "num_tokens": 207256588.0, + "step": 8009 + }, + { + "epoch": 0.8796397979354272, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.446730136871338, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7158169746398926, + "num_tokens": 207278042.0, + "step": 8010 + }, + { + "epoch": 0.8797496156380409, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.320117473602295, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6856154799461365, + "num_tokens": 207305518.0, + "step": 8011 + }, + { + "epoch": 0.8798594333406545, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2058498859405518, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 
0.7060400247573853, + "num_tokens": 207335128.0, + "step": 8012 + }, + { + "epoch": 0.8799692510432682, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.59660267829895, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7151771783828735, + "num_tokens": 207357774.0, + "step": 8013 + }, + { + "epoch": 0.8800790687458818, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.347093105316162, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7097204923629761, + "num_tokens": 207383683.0, + "step": 8014 + }, + { + "epoch": 0.8801888864484955, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.187516927719116, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7113831639289856, + "num_tokens": 207413913.0, + "step": 8015 + }, + { + "epoch": 0.8802987041511091, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 1.97713041305542, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7050026059150696, + "num_tokens": 207449426.0, + "step": 8016 + }, + { + "epoch": 0.8804085218537229, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.307140588760376, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.704437255859375, + "num_tokens": 207476016.0, + "step": 8017 + }, + { + "epoch": 0.8805183395563365, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4923758506774902, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7126312255859375, + "num_tokens": 207497955.0, + "step": 8018 + }, + { + "epoch": 0.8806281572589502, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.6163623332977295, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7363064289093018, + "num_tokens": 207516932.0, + "step": 8019 + }, + { + "epoch": 0.8807379749615638, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3100385665893555, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6845705509185791, + "num_tokens": 
207545331.0, + "step": 8020 + }, + { + "epoch": 0.8808477926641775, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.53178071975708, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.708281397819519, + "num_tokens": 207568324.0, + "step": 8021 + }, + { + "epoch": 0.8809576103667911, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.470163106918335, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7188841700553894, + "num_tokens": 207590390.0, + "step": 8022 + }, + { + "epoch": 0.8810674280694047, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.280005693435669, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7246246337890625, + "num_tokens": 207614861.0, + "step": 8023 + }, + { + "epoch": 0.8811772457720185, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.217571973800659, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6978127956390381, + "num_tokens": 207643294.0, + "step": 8024 + }, + { + "epoch": 0.8812870634746321, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.7033708095550537, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6998710632324219, + "num_tokens": 207663473.0, + "step": 8025 + }, + { + "epoch": 0.8813968811772458, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.223665237426758, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7104138135910034, + "num_tokens": 207693923.0, + "step": 8026 + }, + { + "epoch": 0.8815066988798594, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2827892303466797, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7282363176345825, + "num_tokens": 207722358.0, + "step": 8027 + }, + { + "epoch": 0.8816165165824731, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3705618381500244, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7084530591964722, + "num_tokens": 207748579.0, + "step": 8028 + }, + { + 
"epoch": 0.8817263342850867, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.57820725440979, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7198953032493591, + "num_tokens": 207771070.0, + "step": 8029 + }, + { + "epoch": 0.8818361519877004, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.7251017093658447, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7157586812973022, + "num_tokens": 207789767.0, + "step": 8030 + }, + { + "epoch": 0.881945969690314, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4520113468170166, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7177649736404419, + "num_tokens": 207810612.0, + "step": 8031 + }, + { + "epoch": 0.8820557873929278, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3110170364379883, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6942019462585449, + "num_tokens": 207836096.0, + "step": 8032 + }, + { + "epoch": 0.8821656050955414, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 1.9684149026870728, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6997966170310974, + "num_tokens": 207868083.0, + "step": 8033 + }, + { + "epoch": 0.8822754227981551, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3928043842315674, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7180079221725464, + "num_tokens": 207892881.0, + "step": 8034 + }, + { + "epoch": 0.8823852405007687, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.1801605224609375, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6926039457321167, + "num_tokens": 207924786.0, + "step": 8035 + }, + { + "epoch": 0.8824950582033824, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.7389373779296875, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7207650542259216, + "num_tokens": 207946705.0, + "step": 8036 + }, + { + "epoch": 0.882604875905996, + 
"ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4271492958068848, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7305673956871033, + "num_tokens": 207968344.0, + "step": 8037 + }, + { + "epoch": 0.8827146936086097, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4672043323516846, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7070528864860535, + "num_tokens": 207991588.0, + "step": 8038 + }, + { + "epoch": 0.8828245113112234, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2328860759735107, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.693600058555603, + "num_tokens": 208020156.0, + "step": 8039 + }, + { + "epoch": 0.8829343290138371, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.682440757751465, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7102715969085693, + "num_tokens": 208039776.0, + "step": 8040 + }, + { + "epoch": 0.8830441467164507, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 1.9546078443527222, + "learning_rate": 1e-06, + "loss": 1.1113, + "mean_token_accuracy": 0.6810410022735596, + "num_tokens": 208078399.0, + "step": 8041 + }, + { + "epoch": 0.8831539644190644, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.5472335815429688, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7111865282058716, + "num_tokens": 208098993.0, + "step": 8042 + }, + { + "epoch": 0.883263782121678, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.087766408920288, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7057314515113831, + "num_tokens": 208132064.0, + "step": 8043 + }, + { + "epoch": 0.8833735998242916, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.211099147796631, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7293941974639893, + "num_tokens": 208158784.0, + "step": 8044 + }, + { + "epoch": 0.8834834175269053, + "ewc_loss": 1.4543533325195312e-05, + 
"grad_norm": 2.2864534854888916, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6999073028564453, + "num_tokens": 208184449.0, + "step": 8045 + }, + { + "epoch": 0.883593235229519, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.6003379821777344, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7190069556236267, + "num_tokens": 208204624.0, + "step": 8046 + }, + { + "epoch": 0.8837030529321327, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.6169679164886475, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6977657079696655, + "num_tokens": 208227598.0, + "step": 8047 + }, + { + "epoch": 0.8838128706347463, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4389660358428955, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7225451469421387, + "num_tokens": 208251223.0, + "step": 8048 + }, + { + "epoch": 0.88392268833736, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.6711599826812744, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6993203163146973, + "num_tokens": 208271542.0, + "step": 8049 + }, + { + "epoch": 0.8840325060399736, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.173398017883301, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.698411226272583, + "num_tokens": 208300464.0, + "step": 8050 + }, + { + "epoch": 0.8841423237425873, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.628864049911499, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7120239734649658, + "num_tokens": 208320189.0, + "step": 8051 + }, + { + "epoch": 0.8842521414452009, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.101858615875244, + "learning_rate": 1e-06, + "loss": 1.1109, + "mean_token_accuracy": 0.6895684599876404, + "num_tokens": 208355152.0, + "step": 8052 + }, + { + "epoch": 0.8843619591478147, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.162074565887451, + 
"learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.683822512626648, + "num_tokens": 208383984.0, + "step": 8053 + }, + { + "epoch": 0.8844717768504283, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2199223041534424, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7143480181694031, + "num_tokens": 208410915.0, + "step": 8054 + }, + { + "epoch": 0.884581594553042, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2182528972625732, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7420538663864136, + "num_tokens": 208436148.0, + "step": 8055 + }, + { + "epoch": 0.8846914122556556, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2948052883148193, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.7078873515129089, + "num_tokens": 208465224.0, + "step": 8056 + }, + { + "epoch": 0.8848012299582693, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.10351824760437, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7318326234817505, + "num_tokens": 208494111.0, + "step": 8057 + }, + { + "epoch": 0.8849110476608829, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.5989768505096436, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.72247314453125, + "num_tokens": 208514390.0, + "step": 8058 + }, + { + "epoch": 0.8850208653634966, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2000954151153564, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6965757608413696, + "num_tokens": 208543421.0, + "step": 8059 + }, + { + "epoch": 0.8851306830661102, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3586950302124023, + "learning_rate": 1e-06, + "loss": 1.0985, + "mean_token_accuracy": 0.6850218772888184, + "num_tokens": 208570744.0, + "step": 8060 + }, + { + "epoch": 0.885240500768724, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.340014934539795, + "learning_rate": 1e-06, + "loss": 
0.9567, + "mean_token_accuracy": 0.7151923179626465, + "num_tokens": 208596170.0, + "step": 8061 + }, + { + "epoch": 0.8853503184713376, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.298322916030884, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.694514811038971, + "num_tokens": 208625206.0, + "step": 8062 + }, + { + "epoch": 0.8854601361739513, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.6468658447265625, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7223612666130066, + "num_tokens": 208645910.0, + "step": 8063 + }, + { + "epoch": 0.8855699538765649, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.273678779602051, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7036155462265015, + "num_tokens": 208672805.0, + "step": 8064 + }, + { + "epoch": 0.8856797715791785, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2109181880950928, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.7047683596611023, + "num_tokens": 208703429.0, + "step": 8065 + }, + { + "epoch": 0.8857895892817922, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2859556674957275, + "learning_rate": 1e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.6803893446922302, + "num_tokens": 208732360.0, + "step": 8066 + }, + { + "epoch": 0.8858994069844058, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3031065464019775, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.698094367980957, + "num_tokens": 208759178.0, + "step": 8067 + }, + { + "epoch": 0.8860092246870196, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.6460070610046387, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7113317251205444, + "num_tokens": 208778917.0, + "step": 8068 + }, + { + "epoch": 0.8861190423896332, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.207467794418335, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 
0.7073825597763062, + "num_tokens": 208808384.0, + "step": 8069 + }, + { + "epoch": 0.8862288600922469, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.5624420642852783, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7108929753303528, + "num_tokens": 208829418.0, + "step": 8070 + }, + { + "epoch": 0.8863386777948605, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.224342107772827, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.7018964290618896, + "num_tokens": 208857521.0, + "step": 8071 + }, + { + "epoch": 0.8864484954974742, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.382222890853882, + "learning_rate": 1e-06, + "loss": 1.0677, + "mean_token_accuracy": 0.6937665939331055, + "num_tokens": 208881906.0, + "step": 8072 + }, + { + "epoch": 0.8865583132000878, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.0437769889831543, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7302203178405762, + "num_tokens": 208914194.0, + "step": 8073 + }, + { + "epoch": 0.8866681309027015, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4248833656311035, + "learning_rate": 1e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.6829925179481506, + "num_tokens": 208939704.0, + "step": 8074 + }, + { + "epoch": 0.8867779486053152, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3248586654663086, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7264221906661987, + "num_tokens": 208963714.0, + "step": 8075 + }, + { + "epoch": 0.8868877663079289, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3173561096191406, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6870850324630737, + "num_tokens": 208992718.0, + "step": 8076 + }, + { + "epoch": 0.8869975840105425, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.1668615341186523, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.713650107383728, + 
"num_tokens": 209021300.0, + "step": 8077 + }, + { + "epoch": 0.8871074017131562, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3673207759857178, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6959857940673828, + "num_tokens": 209048287.0, + "step": 8078 + }, + { + "epoch": 0.8872172194157698, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.686896324157715, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7019057273864746, + "num_tokens": 209069545.0, + "step": 8079 + }, + { + "epoch": 0.8873270371183835, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2892391681671143, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6987693905830383, + "num_tokens": 209095028.0, + "step": 8080 + }, + { + "epoch": 0.8874368548209971, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.456413507461548, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7226777076721191, + "num_tokens": 209118193.0, + "step": 8081 + }, + { + "epoch": 0.8875466725236109, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2284634113311768, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7051024436950684, + "num_tokens": 209146980.0, + "step": 8082 + }, + { + "epoch": 0.8876564902262245, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2358546257019043, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6993773579597473, + "num_tokens": 209174812.0, + "step": 8083 + }, + { + "epoch": 0.8877663079288381, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3254759311676025, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7181529402732849, + "num_tokens": 209200809.0, + "step": 8084 + }, + { + "epoch": 0.8878761256314518, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2639377117156982, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7287650108337402, + "num_tokens": 209225031.0, + 
"step": 8085 + }, + { + "epoch": 0.8879859433340654, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2345869541168213, + "learning_rate": 1e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.677642822265625, + "num_tokens": 209253581.0, + "step": 8086 + }, + { + "epoch": 0.8880957610366791, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.266796350479126, + "learning_rate": 1e-06, + "loss": 1.072, + "mean_token_accuracy": 0.6812437772750854, + "num_tokens": 209283539.0, + "step": 8087 + }, + { + "epoch": 0.8882055787392927, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.158092975616455, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6871296167373657, + "num_tokens": 209313896.0, + "step": 8088 + }, + { + "epoch": 0.8883153964419065, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.297471761703491, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7196352481842041, + "num_tokens": 209338742.0, + "step": 8089 + }, + { + "epoch": 0.8884252141445201, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.610945463180542, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7083736062049866, + "num_tokens": 209358816.0, + "step": 8090 + }, + { + "epoch": 0.8885350318471338, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3624842166900635, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7069535255432129, + "num_tokens": 209383834.0, + "step": 8091 + }, + { + "epoch": 0.8886448495497474, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.363784074783325, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6927819848060608, + "num_tokens": 209410554.0, + "step": 8092 + }, + { + "epoch": 0.8887546672523611, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.711413860321045, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7146375179290771, + "num_tokens": 209430014.0, + "step": 8093 + }, + { + "epoch": 
0.8888644849549747, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.610745668411255, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7041354775428772, + "num_tokens": 209450901.0, + "step": 8094 + }, + { + "epoch": 0.8889743026575884, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.179989814758301, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6956494450569153, + "num_tokens": 209480423.0, + "step": 8095 + }, + { + "epoch": 0.889084120360202, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.226591110229492, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7151353359222412, + "num_tokens": 209509147.0, + "step": 8096 + }, + { + "epoch": 0.8891939380628158, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.166203260421753, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7121977210044861, + "num_tokens": 209539620.0, + "step": 8097 + }, + { + "epoch": 0.8893037557654294, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.6192920207977295, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7286111116409302, + "num_tokens": 209560177.0, + "step": 8098 + }, + { + "epoch": 0.8894135734680431, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.288477897644043, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7102795839309692, + "num_tokens": 209585869.0, + "step": 8099 + }, + { + "epoch": 0.8895233911706567, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.677766799926758, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6959009766578674, + "num_tokens": 209607506.0, + "step": 8100 + }, + { + "epoch": 0.8896332088732704, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4251456260681152, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6994861364364624, + "num_tokens": 209634975.0, + "step": 8101 + }, + { + "epoch": 0.889743026575884, + "ewc_loss": 
1.4603137969970703e-05, + "grad_norm": 2.3997528553009033, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7059351205825806, + "num_tokens": 209661090.0, + "step": 8102 + }, + { + "epoch": 0.8898528442784976, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3998003005981445, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7001670002937317, + "num_tokens": 209687591.0, + "step": 8103 + }, + { + "epoch": 0.8899626619811114, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.545397996902466, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7136660814285278, + "num_tokens": 209709367.0, + "step": 8104 + }, + { + "epoch": 0.890072479683725, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.1463305950164795, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7061823606491089, + "num_tokens": 209740280.0, + "step": 8105 + }, + { + "epoch": 0.8901822973863387, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.287036180496216, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7245967388153076, + "num_tokens": 209766365.0, + "step": 8106 + }, + { + "epoch": 0.8902921150889523, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3790202140808105, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7051119208335876, + "num_tokens": 209792007.0, + "step": 8107 + }, + { + "epoch": 0.890401932791566, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.201622247695923, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.7056275606155396, + "num_tokens": 209820382.0, + "step": 8108 + }, + { + "epoch": 0.8905117504941796, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4451982975006104, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.698337197303772, + "num_tokens": 209844582.0, + "step": 8109 + }, + { + "epoch": 0.8906215681967933, + "ewc_loss": 1.4603137969970703e-05, + 
"grad_norm": 2.214707612991333, + "learning_rate": 1e-06, + "loss": 1.1352, + "mean_token_accuracy": 0.6675976514816284, + "num_tokens": 209876586.0, + "step": 8110 + }, + { + "epoch": 0.890731385899407, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.396080732345581, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7166545391082764, + "num_tokens": 209901208.0, + "step": 8111 + }, + { + "epoch": 0.8908412036020207, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.473141670227051, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7375472784042358, + "num_tokens": 209922889.0, + "step": 8112 + }, + { + "epoch": 0.8909510213046343, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3097972869873047, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7049630880355835, + "num_tokens": 209948901.0, + "step": 8113 + }, + { + "epoch": 0.891060839007248, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3860955238342285, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7055376172065735, + "num_tokens": 209972783.0, + "step": 8114 + }, + { + "epoch": 0.8911706567098616, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.1320362091064453, + "learning_rate": 1e-06, + "loss": 1.0971, + "mean_token_accuracy": 0.6793622970581055, + "num_tokens": 210003463.0, + "step": 8115 + }, + { + "epoch": 0.8912804744124753, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.416545867919922, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.707604169845581, + "num_tokens": 210026671.0, + "step": 8116 + }, + { + "epoch": 0.8913902921150889, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3409507274627686, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7229898571968079, + "num_tokens": 210049798.0, + "step": 8117 + }, + { + "epoch": 0.8915001098177027, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2864067554473877, + 
"learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6954892873764038, + "num_tokens": 210076020.0, + "step": 8118 + }, + { + "epoch": 0.8916099275203163, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.1879923343658447, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6874525547027588, + "num_tokens": 210106761.0, + "step": 8119 + }, + { + "epoch": 0.89171974522293, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3551716804504395, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7197485566139221, + "num_tokens": 210132344.0, + "step": 8120 + }, + { + "epoch": 0.8918295629255436, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.392021417617798, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.702096700668335, + "num_tokens": 210159175.0, + "step": 8121 + }, + { + "epoch": 0.8919393806281573, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.259509325027466, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6894475817680359, + "num_tokens": 210189422.0, + "step": 8122 + }, + { + "epoch": 0.8920491983307709, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.185297727584839, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6844691038131714, + "num_tokens": 210220355.0, + "step": 8123 + }, + { + "epoch": 0.8921590160333845, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4194133281707764, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7113385200500488, + "num_tokens": 210243366.0, + "step": 8124 + }, + { + "epoch": 0.8922688337359982, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.183070182800293, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6984869837760925, + "num_tokens": 210273857.0, + "step": 8125 + }, + { + "epoch": 0.892378651438612, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.549576997756958, + "learning_rate": 1e-06, + "loss": 
1.0444, + "mean_token_accuracy": 0.6903288960456848, + "num_tokens": 210298244.0, + "step": 8126 + }, + { + "epoch": 0.8924884691412256, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.226940393447876, + "learning_rate": 1e-06, + "loss": 1.0892, + "mean_token_accuracy": 0.6828612089157104, + "num_tokens": 210329923.0, + "step": 8127 + }, + { + "epoch": 0.8925982868438392, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.172219753265381, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6995989084243774, + "num_tokens": 210358538.0, + "step": 8128 + }, + { + "epoch": 0.8927081045464529, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.214932918548584, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7060397267341614, + "num_tokens": 210386966.0, + "step": 8129 + }, + { + "epoch": 0.8928179222490665, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.319859743118286, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6909226179122925, + "num_tokens": 210414246.0, + "step": 8130 + }, + { + "epoch": 0.8929277399516802, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.5289101600646973, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6922935247421265, + "num_tokens": 210437784.0, + "step": 8131 + }, + { + "epoch": 0.8930375576542938, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.400332450866699, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6978238224983215, + "num_tokens": 210463775.0, + "step": 8132 + }, + { + "epoch": 0.8931473753569076, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.285599708557129, + "learning_rate": 1e-06, + "loss": 1.0731, + "mean_token_accuracy": 0.6939988136291504, + "num_tokens": 210492247.0, + "step": 8133 + }, + { + "epoch": 0.8932571930595212, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4947891235351562, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 
0.698620617389679, + "num_tokens": 210516236.0, + "step": 8134 + }, + { + "epoch": 0.8933670107621349, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.449040174484253, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.719355046749115, + "num_tokens": 210539167.0, + "step": 8135 + }, + { + "epoch": 0.8934768284647485, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3404362201690674, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7134491801261902, + "num_tokens": 210562684.0, + "step": 8136 + }, + { + "epoch": 0.8935866461673622, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.020986318588257, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.719693124294281, + "num_tokens": 210596516.0, + "step": 8137 + }, + { + "epoch": 0.8936964638699758, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2192323207855225, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7205990552902222, + "num_tokens": 210623295.0, + "step": 8138 + }, + { + "epoch": 0.8938062815725895, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.55942440032959, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7224724292755127, + "num_tokens": 210643221.0, + "step": 8139 + }, + { + "epoch": 0.8939160992752032, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2641232013702393, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6940585374832153, + "num_tokens": 210670898.0, + "step": 8140 + }, + { + "epoch": 0.8940259169778169, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.538663148880005, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7039082050323486, + "num_tokens": 210694162.0, + "step": 8141 + }, + { + "epoch": 0.8941357346804305, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3657419681549072, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6940902471542358, + "num_tokens": 
210719930.0, + "step": 8142 + }, + { + "epoch": 0.8942455523830442, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.350456714630127, + "learning_rate": 1e-06, + "loss": 1.0799, + "mean_token_accuracy": 0.6846862435340881, + "num_tokens": 210747367.0, + "step": 8143 + }, + { + "epoch": 0.8943553700856578, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3466997146606445, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.7060292959213257, + "num_tokens": 210773069.0, + "step": 8144 + }, + { + "epoch": 0.8944651877882714, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2598609924316406, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7103956937789917, + "num_tokens": 210801344.0, + "step": 8145 + }, + { + "epoch": 0.8945750054908851, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.1457481384277344, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7146831750869751, + "num_tokens": 210835548.0, + "step": 8146 + }, + { + "epoch": 0.8946848231934988, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.4304301738739014, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.722962498664856, + "num_tokens": 210857281.0, + "step": 8147 + }, + { + "epoch": 0.8947946408961125, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2489218711853027, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.7043899297714233, + "num_tokens": 210884292.0, + "step": 8148 + }, + { + "epoch": 0.8949044585987261, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4074525833129883, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7049087882041931, + "num_tokens": 210908448.0, + "step": 8149 + }, + { + "epoch": 0.8950142763013398, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.6027164459228516, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6880724430084229, + "num_tokens": 210932876.0, + "step": 8150 + }, 
+ { + "epoch": 0.8951240940039534, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.0842251777648926, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7122082710266113, + "num_tokens": 210963703.0, + "step": 8151 + }, + { + "epoch": 0.8952339117065671, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.346925735473633, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7073136568069458, + "num_tokens": 210990458.0, + "step": 8152 + }, + { + "epoch": 0.8953437294091807, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.165221929550171, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6907168626785278, + "num_tokens": 211021700.0, + "step": 8153 + }, + { + "epoch": 0.8954535471117944, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.512648105621338, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7031917572021484, + "num_tokens": 211044523.0, + "step": 8154 + }, + { + "epoch": 0.8955633648144081, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3741438388824463, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7083773016929626, + "num_tokens": 211067809.0, + "step": 8155 + }, + { + "epoch": 0.8956731825170218, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.385420560836792, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.7016463279724121, + "num_tokens": 211093559.0, + "step": 8156 + }, + { + "epoch": 0.8957830002196354, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.2788798809051514, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7169627547264099, + "num_tokens": 211120767.0, + "step": 8157 + }, + { + "epoch": 0.8958928179222491, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3997206687927246, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7140780687332153, + "num_tokens": 211144996.0, + "step": 8158 + }, + { + "epoch": 0.8960026356248627, + 
"ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.128811836242676, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7170419096946716, + "num_tokens": 211175424.0, + "step": 8159 + }, + { + "epoch": 0.8961124533274764, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.2140958309173584, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7038694024085999, + "num_tokens": 211204392.0, + "step": 8160 + }, + { + "epoch": 0.89622227103009, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.39878511428833, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.699601411819458, + "num_tokens": 211229550.0, + "step": 8161 + }, + { + "epoch": 0.8963320887327038, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.394484043121338, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7178938984870911, + "num_tokens": 211255545.0, + "step": 8162 + }, + { + "epoch": 0.8964419064353174, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.2194159030914307, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6961976289749146, + "num_tokens": 211283602.0, + "step": 8163 + }, + { + "epoch": 0.896551724137931, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.7675204277038574, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7047185301780701, + "num_tokens": 211302935.0, + "step": 8164 + }, + { + "epoch": 0.8966615418405447, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.859931468963623, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7049038410186768, + "num_tokens": 211321835.0, + "step": 8165 + }, + { + "epoch": 0.8967713595431583, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.1995885372161865, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6967896223068237, + "num_tokens": 211351891.0, + "step": 8166 + }, + { + "epoch": 0.896881177245772, + "ewc_loss": 1.4603137969970703e-05, + 
"grad_norm": 2.5115842819213867, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6960296630859375, + "num_tokens": 211372892.0, + "step": 8167 + }, + { + "epoch": 0.8969909949483856, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4760582447052, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7154377698898315, + "num_tokens": 211394671.0, + "step": 8168 + }, + { + "epoch": 0.8971008126509994, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3777801990509033, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7106998562812805, + "num_tokens": 211422879.0, + "step": 8169 + }, + { + "epoch": 0.897210630353613, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.174273729324341, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.7000441551208496, + "num_tokens": 211450565.0, + "step": 8170 + }, + { + "epoch": 0.8973204480562267, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.5080270767211914, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6997455954551697, + "num_tokens": 211471829.0, + "step": 8171 + }, + { + "epoch": 0.8974302657588403, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4946014881134033, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7220234870910645, + "num_tokens": 211495522.0, + "step": 8172 + }, + { + "epoch": 0.897540083461454, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3038992881774902, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7091238498687744, + "num_tokens": 211522860.0, + "step": 8173 + }, + { + "epoch": 0.8976499011640676, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4101722240448, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6950016617774963, + "num_tokens": 211548583.0, + "step": 8174 + }, + { + "epoch": 0.8977597188666813, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3748109340667725, + 
"learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.704658031463623, + "num_tokens": 211572444.0, + "step": 8175 + }, + { + "epoch": 0.897869536569295, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.506941795349121, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7226155400276184, + "num_tokens": 211594694.0, + "step": 8176 + }, + { + "epoch": 0.8979793542719087, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.6975057125091553, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7029479146003723, + "num_tokens": 211614529.0, + "step": 8177 + }, + { + "epoch": 0.8980891719745223, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3375205993652344, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7111648917198181, + "num_tokens": 211640705.0, + "step": 8178 + }, + { + "epoch": 0.898198989677136, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4896984100341797, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7110381126403809, + "num_tokens": 211664393.0, + "step": 8179 + }, + { + "epoch": 0.8983088073797496, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.319601535797119, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7137974500656128, + "num_tokens": 211689982.0, + "step": 8180 + }, + { + "epoch": 0.8984186250823633, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.713169574737549, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7102165818214417, + "num_tokens": 211708693.0, + "step": 8181 + }, + { + "epoch": 0.8985284427849769, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.500631809234619, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6932369470596313, + "num_tokens": 211732231.0, + "step": 8182 + }, + { + "epoch": 0.8986382604875905, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.162590980529785, + "learning_rate": 1e-06, + "loss": 
0.9584, + "mean_token_accuracy": 0.7152117490768433, + "num_tokens": 211760191.0, + "step": 8183 + }, + { + "epoch": 0.8987480781902043, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3557353019714355, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6944547295570374, + "num_tokens": 211786463.0, + "step": 8184 + }, + { + "epoch": 0.898857895892818, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.2997281551361084, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7076513767242432, + "num_tokens": 211812926.0, + "step": 8185 + }, + { + "epoch": 0.8989677135954316, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4032998085021973, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7293609976768494, + "num_tokens": 211834368.0, + "step": 8186 + }, + { + "epoch": 0.8990775312980452, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.906667470932007, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7133183479309082, + "num_tokens": 211853863.0, + "step": 8187 + }, + { + "epoch": 0.8991873490006589, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.576101541519165, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6971702575683594, + "num_tokens": 211877620.0, + "step": 8188 + }, + { + "epoch": 0.8992971667032725, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.269916534423828, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6908001899719238, + "num_tokens": 211908189.0, + "step": 8189 + }, + { + "epoch": 0.8994069844058862, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4017367362976074, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6963559985160828, + "num_tokens": 211935489.0, + "step": 8190 + }, + { + "epoch": 0.8995168021084999, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4615371227264404, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 
0.7103453278541565, + "num_tokens": 211958611.0, + "step": 8191 + }, + { + "epoch": 0.8996266198111136, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.2963831424713135, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.705513596534729, + "num_tokens": 211983462.0, + "step": 8192 + }, + { + "epoch": 0.8997364375137272, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.205737829208374, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7216936945915222, + "num_tokens": 212011557.0, + "step": 8193 + }, + { + "epoch": 0.8998462552163409, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.043123960494995, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7246752381324768, + "num_tokens": 212042302.0, + "step": 8194 + }, + { + "epoch": 0.8999560729189545, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.392766237258911, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.7016645073890686, + "num_tokens": 212065357.0, + "step": 8195 + }, + { + "epoch": 0.9000658906215682, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.349973678588867, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7045139670372009, + "num_tokens": 212092573.0, + "step": 8196 + }, + { + "epoch": 0.9001757083241818, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.2383980751037598, + "learning_rate": 1e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6783564686775208, + "num_tokens": 212120452.0, + "step": 8197 + }, + { + "epoch": 0.9002855260267956, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3444862365722656, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7161649465560913, + "num_tokens": 212146205.0, + "step": 8198 + }, + { + "epoch": 0.9003953437294092, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.439556837081909, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7175673842430115, + "num_tokens": 
212170022.0, + "step": 8199 + }, + { + "epoch": 0.9005051614320229, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.271707534790039, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7033119797706604, + "num_tokens": 212197763.0, + "step": 8200 + }, + { + "epoch": 0.9006149791346365, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.400017023086548, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6958513855934143, + "num_tokens": 212223065.0, + "step": 8201 + }, + { + "epoch": 0.9007247968372502, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.6661715507507324, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7039625644683838, + "num_tokens": 212245488.0, + "step": 8202 + }, + { + "epoch": 0.9008346145398638, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.2885608673095703, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6927824020385742, + "num_tokens": 212274724.0, + "step": 8203 + }, + { + "epoch": 0.9009444322424774, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.386753559112549, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7326348423957825, + "num_tokens": 212296850.0, + "step": 8204 + }, + { + "epoch": 0.9010542499450912, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.313560962677002, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.682922899723053, + "num_tokens": 212324861.0, + "step": 8205 + }, + { + "epoch": 0.9011640676477048, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3319737911224365, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.693520188331604, + "num_tokens": 212350599.0, + "step": 8206 + }, + { + "epoch": 0.9012738853503185, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.545396566390991, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7101642489433289, + "num_tokens": 212374498.0, + "step": 8207 + }, + { + 
"epoch": 0.9013837030529321, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4419987201690674, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6888043880462646, + "num_tokens": 212400734.0, + "step": 8208 + }, + { + "epoch": 0.9014935207555458, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.402799129486084, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7251487970352173, + "num_tokens": 212428540.0, + "step": 8209 + }, + { + "epoch": 0.9016033384581594, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.607218027114868, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7193952798843384, + "num_tokens": 212450911.0, + "step": 8210 + }, + { + "epoch": 0.9017131561607731, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.456428289413452, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7304641008377075, + "num_tokens": 212473053.0, + "step": 8211 + }, + { + "epoch": 0.9018229738633867, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.454059600830078, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6964492201805115, + "num_tokens": 212498830.0, + "step": 8212 + }, + { + "epoch": 0.9019327915660005, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.109621286392212, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6938742399215698, + "num_tokens": 212530401.0, + "step": 8213 + }, + { + "epoch": 0.9020426092686141, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.231663703918457, + "learning_rate": 1e-06, + "loss": 1.0818, + "mean_token_accuracy": 0.6864486932754517, + "num_tokens": 212558520.0, + "step": 8214 + }, + { + "epoch": 0.9021524269712278, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.314204216003418, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6909952163696289, + "num_tokens": 212586735.0, + "step": 8215 + }, + { + "epoch": 0.9022622446738414, + 
"ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3814992904663086, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6949095129966736, + "num_tokens": 212613134.0, + "step": 8216 + }, + { + "epoch": 0.9023720623764551, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4146666526794434, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6977236270904541, + "num_tokens": 212638391.0, + "step": 8217 + }, + { + "epoch": 0.9024818800790687, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.4276487827301025, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7177741527557373, + "num_tokens": 212663725.0, + "step": 8218 + }, + { + "epoch": 0.9025916977816824, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.362562656402588, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.707855224609375, + "num_tokens": 212693559.0, + "step": 8219 + }, + { + "epoch": 0.9027015154842961, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.395254611968994, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.705897867679596, + "num_tokens": 212716262.0, + "step": 8220 + }, + { + "epoch": 0.9028113331869098, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.1368772983551025, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7135926485061646, + "num_tokens": 212747179.0, + "step": 8221 + }, + { + "epoch": 0.9029211508895234, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.223593235015869, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7075315713882446, + "num_tokens": 212774828.0, + "step": 8222 + }, + { + "epoch": 0.903030968592137, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.258509635925293, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6906554102897644, + "num_tokens": 212808125.0, + "step": 8223 + }, + { + "epoch": 0.9031407862947507, + "ewc_loss": 1.4603137969970703e-05, + 
"grad_norm": 2.3583006858825684, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6836308836936951, + "num_tokens": 212834200.0, + "step": 8224 + }, + { + "epoch": 0.9032506039973643, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.3178718090057373, + "learning_rate": 1e-06, + "loss": 1.0984, + "mean_token_accuracy": 0.6810764074325562, + "num_tokens": 212861783.0, + "step": 8225 + }, + { + "epoch": 0.903360421699978, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.079066276550293, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6907737255096436, + "num_tokens": 212893688.0, + "step": 8226 + }, + { + "epoch": 0.9034702394025917, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.264981269836426, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7040059566497803, + "num_tokens": 212923304.0, + "step": 8227 + }, + { + "epoch": 0.9035800571052054, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.296076774597168, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.6850423812866211, + "num_tokens": 212955307.0, + "step": 8228 + }, + { + "epoch": 0.903689874807819, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.494734764099121, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7060389518737793, + "num_tokens": 212976112.0, + "step": 8229 + }, + { + "epoch": 0.9037996925104327, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 2.7490992546081543, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7116674780845642, + "num_tokens": 212995197.0, + "step": 8230 + }, + { + "epoch": 0.9039095102130463, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.292695999145508, + "learning_rate": 1e-06, + "loss": 1.1377, + "mean_token_accuracy": 0.6725983619689941, + "num_tokens": 213024273.0, + "step": 8231 + }, + { + "epoch": 0.90401932791566, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.1008286476135254, + 
"learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7139239311218262, + "num_tokens": 213053233.0, + "step": 8232 + }, + { + "epoch": 0.9041291456182736, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.617292642593384, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7249650955200195, + "num_tokens": 213074531.0, + "step": 8233 + }, + { + "epoch": 0.9042389633208874, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.202897310256958, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7024883031845093, + "num_tokens": 213103128.0, + "step": 8234 + }, + { + "epoch": 0.904348781023501, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.580136775970459, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7117921113967896, + "num_tokens": 213124284.0, + "step": 8235 + }, + { + "epoch": 0.9044585987261147, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.1032044887542725, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7361606359481812, + "num_tokens": 213153427.0, + "step": 8236 + }, + { + "epoch": 0.9045684164287283, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.2902941703796387, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6936532258987427, + "num_tokens": 213180649.0, + "step": 8237 + }, + { + "epoch": 0.904678234131342, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3063087463378906, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6914482712745667, + "num_tokens": 213212215.0, + "step": 8238 + }, + { + "epoch": 0.9047880518339556, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.1444318294525146, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6880676746368408, + "num_tokens": 213242563.0, + "step": 8239 + }, + { + "epoch": 0.9048978695365693, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3049674034118652, + "learning_rate": 1e-06, + "loss": 
1.0723, + "mean_token_accuracy": 0.6851081252098083, + "num_tokens": 213271046.0, + "step": 8240 + }, + { + "epoch": 0.9050076872391829, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.116537570953369, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7172277569770813, + "num_tokens": 213301255.0, + "step": 8241 + }, + { + "epoch": 0.9051175049417967, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.425442695617676, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7233645915985107, + "num_tokens": 213324833.0, + "step": 8242 + }, + { + "epoch": 0.9052273226444103, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.228217124938965, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.689194917678833, + "num_tokens": 213351024.0, + "step": 8243 + }, + { + "epoch": 0.905337140347024, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.2427148818969727, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6869856119155884, + "num_tokens": 213379797.0, + "step": 8244 + }, + { + "epoch": 0.9054469580496376, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.187110424041748, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6845439672470093, + "num_tokens": 213409556.0, + "step": 8245 + }, + { + "epoch": 0.9055567757522512, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4035563468933105, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7062625885009766, + "num_tokens": 213433519.0, + "step": 8246 + }, + { + "epoch": 0.9056665934548649, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3451008796691895, + "learning_rate": 1e-06, + "loss": 1.066, + "mean_token_accuracy": 0.690603494644165, + "num_tokens": 213462154.0, + "step": 8247 + }, + { + "epoch": 0.9057764111574785, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4588449001312256, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 
0.7088139057159424, + "num_tokens": 213485521.0, + "step": 8248 + }, + { + "epoch": 0.9058862288600923, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.2065255641937256, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6988565921783447, + "num_tokens": 213514699.0, + "step": 8249 + }, + { + "epoch": 0.9059960465627059, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4388599395751953, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6936429142951965, + "num_tokens": 213539877.0, + "step": 8250 + }, + { + "epoch": 0.9061058642653196, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3180129528045654, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6950740814208984, + "num_tokens": 213567267.0, + "step": 8251 + }, + { + "epoch": 0.9062156819679332, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3729641437530518, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7139567732810974, + "num_tokens": 213593174.0, + "step": 8252 + }, + { + "epoch": 0.9063254996705469, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.352895975112915, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.7011822462081909, + "num_tokens": 213619039.0, + "step": 8253 + }, + { + "epoch": 0.9064353173731605, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4243524074554443, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7124977111816406, + "num_tokens": 213644412.0, + "step": 8254 + }, + { + "epoch": 0.9065451350757742, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.5304908752441406, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7093385457992554, + "num_tokens": 213668556.0, + "step": 8255 + }, + { + "epoch": 0.9066549527783879, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.1430299282073975, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7425935864448547, + 
"num_tokens": 213694420.0, + "step": 8256 + }, + { + "epoch": 0.9067647704810016, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4627997875213623, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6864815950393677, + "num_tokens": 213721246.0, + "step": 8257 + }, + { + "epoch": 0.9068745881836152, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.252758026123047, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7137709856033325, + "num_tokens": 213747046.0, + "step": 8258 + }, + { + "epoch": 0.9069844058862289, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4614665508270264, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6984032392501831, + "num_tokens": 213771105.0, + "step": 8259 + }, + { + "epoch": 0.9070942235888425, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3032331466674805, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7214286923408508, + "num_tokens": 213795878.0, + "step": 8260 + }, + { + "epoch": 0.9072040412914562, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.2379262447357178, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6911845803260803, + "num_tokens": 213824083.0, + "step": 8261 + }, + { + "epoch": 0.9073138589940698, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.252098560333252, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.71454918384552, + "num_tokens": 213851743.0, + "step": 8262 + }, + { + "epoch": 0.9074236766966836, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.6494922637939453, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7051857709884644, + "num_tokens": 213872726.0, + "step": 8263 + }, + { + "epoch": 0.9075334943992972, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.0382206439971924, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6981522440910339, + "num_tokens": 213904422.0, + "step": 
8264 + }, + { + "epoch": 0.9076433121019108, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3598570823669434, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7059060335159302, + "num_tokens": 213929673.0, + "step": 8265 + }, + { + "epoch": 0.9077531298045245, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4732260704040527, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7111166715621948, + "num_tokens": 213952516.0, + "step": 8266 + }, + { + "epoch": 0.9078629475071381, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.087013006210327, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6905887722969055, + "num_tokens": 213982810.0, + "step": 8267 + }, + { + "epoch": 0.9079727652097518, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.292201042175293, + "learning_rate": 1e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.6829266548156738, + "num_tokens": 214010916.0, + "step": 8268 + }, + { + "epoch": 0.9080825829123654, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4466750621795654, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7017942667007446, + "num_tokens": 214035488.0, + "step": 8269 + }, + { + "epoch": 0.9081924006149792, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.2353086471557617, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.720550537109375, + "num_tokens": 214064658.0, + "step": 8270 + }, + { + "epoch": 0.9083022183175928, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.361077308654785, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6994255781173706, + "num_tokens": 214090493.0, + "step": 8271 + }, + { + "epoch": 0.9084120360202065, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.67268705368042, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7186998724937439, + "num_tokens": 214109319.0, + "step": 8272 + }, + { + "epoch": 
0.9085218537228201, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.288205146789551, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.7035813927650452, + "num_tokens": 214135566.0, + "step": 8273 + }, + { + "epoch": 0.9086316714254338, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.392911911010742, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7165273427963257, + "num_tokens": 214158638.0, + "step": 8274 + }, + { + "epoch": 0.9087414891280474, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.32353138923645, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.694451093673706, + "num_tokens": 214186129.0, + "step": 8275 + }, + { + "epoch": 0.9088513068306611, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.212566375732422, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6948742270469666, + "num_tokens": 214216102.0, + "step": 8276 + }, + { + "epoch": 0.9089611245332747, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.1700620651245117, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7009594440460205, + "num_tokens": 214244267.0, + "step": 8277 + }, + { + "epoch": 0.9090709422358885, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.105250358581543, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6918976306915283, + "num_tokens": 214274438.0, + "step": 8278 + }, + { + "epoch": 0.9091807599385021, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.092348575592041, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6900725364685059, + "num_tokens": 214306660.0, + "step": 8279 + }, + { + "epoch": 0.9092905776411158, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.422966241836548, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7415196895599365, + "num_tokens": 214331101.0, + "step": 8280 + }, + { + "epoch": 0.9094003953437294, + "ewc_loss": 
1.4662742614746094e-05, + "grad_norm": 2.2869603633880615, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.719139814376831, + "num_tokens": 214358743.0, + "step": 8281 + }, + { + "epoch": 0.909510213046343, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.299788475036621, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6975658535957336, + "num_tokens": 214386075.0, + "step": 8282 + }, + { + "epoch": 0.9096200307489567, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.8569304943084717, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7063046097755432, + "num_tokens": 214405293.0, + "step": 8283 + }, + { + "epoch": 0.9097298484515703, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.233959674835205, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.703761637210846, + "num_tokens": 214432685.0, + "step": 8284 + }, + { + "epoch": 0.9098396661541841, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.3426856994628906, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6991790533065796, + "num_tokens": 214458723.0, + "step": 8285 + }, + { + "epoch": 0.9099494838567977, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.269660234451294, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7127442955970764, + "num_tokens": 214486020.0, + "step": 8286 + }, + { + "epoch": 0.9100593015594114, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.5054967403411865, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.7024190425872803, + "num_tokens": 214509254.0, + "step": 8287 + }, + { + "epoch": 0.910169119262025, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.1983354091644287, + "learning_rate": 1e-06, + "loss": 1.087, + "mean_token_accuracy": 0.6851403117179871, + "num_tokens": 214540063.0, + "step": 8288 + }, + { + "epoch": 0.9102789369646387, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 
2.467473030090332, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6951099634170532, + "num_tokens": 214565092.0, + "step": 8289 + }, + { + "epoch": 0.9103887546672523, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.233778476715088, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7020103931427002, + "num_tokens": 214598113.0, + "step": 8290 + }, + { + "epoch": 0.910498572369866, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4442336559295654, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7070430517196655, + "num_tokens": 214621759.0, + "step": 8291 + }, + { + "epoch": 0.9106083900724797, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.324052333831787, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.718102216720581, + "num_tokens": 214646977.0, + "step": 8292 + }, + { + "epoch": 0.9107182077750934, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.5261244773864746, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7317790985107422, + "num_tokens": 214667009.0, + "step": 8293 + }, + { + "epoch": 0.910828025477707, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.4467108249664307, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6909682750701904, + "num_tokens": 214690203.0, + "step": 8294 + }, + { + "epoch": 0.9109378431803207, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.620537757873535, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7239048480987549, + "num_tokens": 214711257.0, + "step": 8295 + }, + { + "epoch": 0.9110476608829343, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.1322519779205322, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7026175260543823, + "num_tokens": 214740590.0, + "step": 8296 + }, + { + "epoch": 0.911157478585548, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.056276798248291, + "learning_rate": 
1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7116274833679199, + "num_tokens": 214772673.0, + "step": 8297 + }, + { + "epoch": 0.9112672962881616, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.492431163787842, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6948873400688171, + "num_tokens": 214796315.0, + "step": 8298 + }, + { + "epoch": 0.9113771139907754, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3291079998016357, + "learning_rate": 1e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6831434965133667, + "num_tokens": 214822372.0, + "step": 8299 + }, + { + "epoch": 0.911486931693389, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.0625598430633545, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6905178427696228, + "num_tokens": 214855213.0, + "step": 8300 + }, + { + "epoch": 0.9115967493960027, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.069776773452759, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7012597322463989, + "num_tokens": 214890209.0, + "step": 8301 + }, + { + "epoch": 0.9117065670986163, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.0961108207702637, + "learning_rate": 1e-06, + "loss": 1.0999, + "mean_token_accuracy": 0.684352695941925, + "num_tokens": 214924274.0, + "step": 8302 + }, + { + "epoch": 0.91181638480123, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.5303943157196045, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6957824230194092, + "num_tokens": 214946732.0, + "step": 8303 + }, + { + "epoch": 0.9119262025038436, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.71543025970459, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.728743314743042, + "num_tokens": 214965979.0, + "step": 8304 + }, + { + "epoch": 0.9120360202064572, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.5548980236053467, + "learning_rate": 1e-06, + "loss": 1.0142, + 
"mean_token_accuracy": 0.69917893409729, + "num_tokens": 214989147.0, + "step": 8305 + }, + { + "epoch": 0.9121458379090709, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.452258586883545, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7126615047454834, + "num_tokens": 215012035.0, + "step": 8306 + }, + { + "epoch": 0.9122556556116846, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.421261787414551, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7201312780380249, + "num_tokens": 215036677.0, + "step": 8307 + }, + { + "epoch": 0.9123654733142983, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 2.570601463317871, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7245721817016602, + "num_tokens": 215057133.0, + "step": 8308 + }, + { + "epoch": 0.9124752910169119, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.302927255630493, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6942733526229858, + "num_tokens": 215082831.0, + "step": 8309 + }, + { + "epoch": 0.9125851087195256, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3644330501556396, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.7015894055366516, + "num_tokens": 215106983.0, + "step": 8310 + }, + { + "epoch": 0.9126949264221392, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.327841281890869, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7185394167900085, + "num_tokens": 215131950.0, + "step": 8311 + }, + { + "epoch": 0.9128047441247529, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.5275959968566895, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7043387293815613, + "num_tokens": 215153394.0, + "step": 8312 + }, + { + "epoch": 0.9129145618273665, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.112229824066162, + "learning_rate": 1e-06, + "loss": 1.0908, + "mean_token_accuracy": 
0.6752637028694153, + "num_tokens": 215184587.0, + "step": 8313 + }, + { + "epoch": 0.9130243795299803, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.2615418434143066, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6966608762741089, + "num_tokens": 215212894.0, + "step": 8314 + }, + { + "epoch": 0.9131341972325939, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.2563304901123047, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6941404938697815, + "num_tokens": 215240789.0, + "step": 8315 + }, + { + "epoch": 0.9132440149352076, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.419271945953369, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6832970380783081, + "num_tokens": 215265364.0, + "step": 8316 + }, + { + "epoch": 0.9133538326378212, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.574758291244507, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7058541774749756, + "num_tokens": 215285525.0, + "step": 8317 + }, + { + "epoch": 0.9134636503404349, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.487550735473633, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7167563438415527, + "num_tokens": 215308355.0, + "step": 8318 + }, + { + "epoch": 0.9135734680430485, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.53182053565979, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6930941343307495, + "num_tokens": 215336239.0, + "step": 8319 + }, + { + "epoch": 0.9136832857456622, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.4670958518981934, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7049423456192017, + "num_tokens": 215360536.0, + "step": 8320 + }, + { + "epoch": 0.9137931034482759, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.371213436126709, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7065675258636475, + "num_tokens": 
215386149.0, + "step": 8321 + }, + { + "epoch": 0.9139029211508896, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.6662580966949463, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7190965414047241, + "num_tokens": 215407934.0, + "step": 8322 + }, + { + "epoch": 0.9140127388535032, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.2961339950561523, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7260248064994812, + "num_tokens": 215435377.0, + "step": 8323 + }, + { + "epoch": 0.9141225565561168, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.392483949661255, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6971054673194885, + "num_tokens": 215461261.0, + "step": 8324 + }, + { + "epoch": 0.9142323742587305, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.5086829662323, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7197055816650391, + "num_tokens": 215483834.0, + "step": 8325 + }, + { + "epoch": 0.9143421919613441, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3942039012908936, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6931953430175781, + "num_tokens": 215508844.0, + "step": 8326 + }, + { + "epoch": 0.9144520096639578, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.1909611225128174, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7275599837303162, + "num_tokens": 215536103.0, + "step": 8327 + }, + { + "epoch": 0.9145618273665715, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.635045051574707, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7247174978256226, + "num_tokens": 215555919.0, + "step": 8328 + }, + { + "epoch": 0.9146716450691852, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3617942333221436, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.705458402633667, + "num_tokens": 215580581.0, + "step": 8329 + }, + { 
+ "epoch": 0.9147814627717988, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3538453578948975, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.720920205116272, + "num_tokens": 215605680.0, + "step": 8330 + }, + { + "epoch": 0.9148912804744125, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.1664669513702393, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.689826250076294, + "num_tokens": 215634982.0, + "step": 8331 + }, + { + "epoch": 0.9150010981770261, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.132385730743408, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6881299018859863, + "num_tokens": 215667249.0, + "step": 8332 + }, + { + "epoch": 0.9151109158796398, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3921828269958496, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.717570424079895, + "num_tokens": 215690494.0, + "step": 8333 + }, + { + "epoch": 0.9152207335822534, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.2742817401885986, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.69244784116745, + "num_tokens": 215716541.0, + "step": 8334 + }, + { + "epoch": 0.9153305512848671, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.339998960494995, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7035430669784546, + "num_tokens": 215741018.0, + "step": 8335 + }, + { + "epoch": 0.9154403689874808, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3427205085754395, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7127261757850647, + "num_tokens": 215765566.0, + "step": 8336 + }, + { + "epoch": 0.9155501866900945, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.4801549911499023, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7166401147842407, + "num_tokens": 215789434.0, + "step": 8337 + }, + { + "epoch": 0.9156600043927081, + 
"ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.2514123916625977, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7137295603752136, + "num_tokens": 215816457.0, + "step": 8338 + }, + { + "epoch": 0.9157698220953218, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.564422607421875, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7203355431556702, + "num_tokens": 215839046.0, + "step": 8339 + }, + { + "epoch": 0.9158796397979354, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.374741315841675, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7222498655319214, + "num_tokens": 215861389.0, + "step": 8340 + }, + { + "epoch": 0.915989457500549, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.645362615585327, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7143198251724243, + "num_tokens": 215881662.0, + "step": 8341 + }, + { + "epoch": 0.9160992752031627, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.5868782997131348, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7217060327529907, + "num_tokens": 215901617.0, + "step": 8342 + }, + { + "epoch": 0.9162090929057765, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.303682565689087, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7110113501548767, + "num_tokens": 215927452.0, + "step": 8343 + }, + { + "epoch": 0.9163189106083901, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3218681812286377, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7129697203636169, + "num_tokens": 215953475.0, + "step": 8344 + }, + { + "epoch": 0.9164287283110037, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3971757888793945, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.7002227306365967, + "num_tokens": 215978069.0, + "step": 8345 + }, + { + "epoch": 0.9165385460136174, + "ewc_loss": 1.4781951904296875e-05, + 
"grad_norm": 2.177419424057007, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7144955396652222, + "num_tokens": 216005963.0, + "step": 8346 + }, + { + "epoch": 0.916648363716231, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3501930236816406, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7114592790603638, + "num_tokens": 216032217.0, + "step": 8347 + }, + { + "epoch": 0.9167581814188447, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2491488456726074, + "learning_rate": 1e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.6783702373504639, + "num_tokens": 216061479.0, + "step": 8348 + }, + { + "epoch": 0.9168679991214583, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2131102085113525, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7045998573303223, + "num_tokens": 216090079.0, + "step": 8349 + }, + { + "epoch": 0.9169778168240721, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4424166679382324, + "learning_rate": 1e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6839655041694641, + "num_tokens": 216117217.0, + "step": 8350 + }, + { + "epoch": 0.9170876345266857, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1919023990631104, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7227458953857422, + "num_tokens": 216144407.0, + "step": 8351 + }, + { + "epoch": 0.9171974522292994, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4584548473358154, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7226799130439758, + "num_tokens": 216168373.0, + "step": 8352 + }, + { + "epoch": 0.917307269931913, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4451417922973633, + "learning_rate": 1e-06, + "loss": 1.1181, + "mean_token_accuracy": 0.6681678295135498, + "num_tokens": 216193251.0, + "step": 8353 + }, + { + "epoch": 0.9174170876345267, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5620288848876953, + 
"learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7229408025741577, + "num_tokens": 216213242.0, + "step": 8354 + }, + { + "epoch": 0.9175269053371403, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.254903554916382, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7146467566490173, + "num_tokens": 216240426.0, + "step": 8355 + }, + { + "epoch": 0.917636723039754, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.1825637817382812, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6949376463890076, + "num_tokens": 216270015.0, + "step": 8356 + }, + { + "epoch": 0.9177465407423677, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.124879837036133, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.7006745338439941, + "num_tokens": 216300090.0, + "step": 8357 + }, + { + "epoch": 0.9178563584449814, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.1152279376983643, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7076617479324341, + "num_tokens": 216330103.0, + "step": 8358 + }, + { + "epoch": 0.917966176147595, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.2114102840423584, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7159487009048462, + "num_tokens": 216356619.0, + "step": 8359 + }, + { + "epoch": 0.9180759938502087, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.470794200897217, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7054117321968079, + "num_tokens": 216379171.0, + "step": 8360 + }, + { + "epoch": 0.9181858115528223, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.0910375118255615, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.691460907459259, + "num_tokens": 216409387.0, + "step": 8361 + }, + { + "epoch": 0.918295629255436, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3922476768493652, + "learning_rate": 1e-06, + "loss": 
0.9639, + "mean_token_accuracy": 0.7120220065116882, + "num_tokens": 216433635.0, + "step": 8362 + }, + { + "epoch": 0.9184054469580496, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.1089046001434326, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6950803995132446, + "num_tokens": 216466485.0, + "step": 8363 + }, + { + "epoch": 0.9185152646606632, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.488227605819702, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7125263214111328, + "num_tokens": 216488907.0, + "step": 8364 + }, + { + "epoch": 0.918625082363277, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.0393543243408203, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6969239711761475, + "num_tokens": 216521259.0, + "step": 8365 + }, + { + "epoch": 0.9187349000658906, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.74920392036438, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7253882884979248, + "num_tokens": 216538864.0, + "step": 8366 + }, + { + "epoch": 0.9188447177685043, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.143568754196167, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7053536772727966, + "num_tokens": 216568362.0, + "step": 8367 + }, + { + "epoch": 0.9189545354711179, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.141890287399292, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7019404172897339, + "num_tokens": 216598722.0, + "step": 8368 + }, + { + "epoch": 0.9190643531737316, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.443655490875244, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6925609111785889, + "num_tokens": 216623241.0, + "step": 8369 + }, + { + "epoch": 0.9191741708763452, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2381653785705566, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 
0.7192622423171997, + "num_tokens": 216650065.0, + "step": 8370 + }, + { + "epoch": 0.9192839885789589, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3797264099121094, + "learning_rate": 1e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.685973048210144, + "num_tokens": 216675984.0, + "step": 8371 + }, + { + "epoch": 0.9193938062815726, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.131934404373169, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.7022433876991272, + "num_tokens": 216706335.0, + "step": 8372 + }, + { + "epoch": 0.9195036239841863, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.0671067237854004, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6829766035079956, + "num_tokens": 216738678.0, + "step": 8373 + }, + { + "epoch": 0.9196134416867999, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.136133909225464, + "learning_rate": 1e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7608060836791992, + "num_tokens": 216764265.0, + "step": 8374 + }, + { + "epoch": 0.9197232593894136, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5083067417144775, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7276930212974548, + "num_tokens": 216787019.0, + "step": 8375 + }, + { + "epoch": 0.9198330770920272, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.38356351852417, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7039232850074768, + "num_tokens": 216810780.0, + "step": 8376 + }, + { + "epoch": 0.9199428947946409, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.524705648422241, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6892931461334229, + "num_tokens": 216833727.0, + "step": 8377 + }, + { + "epoch": 0.9200527124972545, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.35127329826355, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6855756044387817, + "num_tokens": 
216860498.0, + "step": 8378 + }, + { + "epoch": 0.9201625301998683, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.6711416244506836, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7211126089096069, + "num_tokens": 216880677.0, + "step": 8379 + }, + { + "epoch": 0.9202723479024819, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4240734577178955, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7104154825210571, + "num_tokens": 216903748.0, + "step": 8380 + }, + { + "epoch": 0.9203821656050956, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.399277448654175, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6884292960166931, + "num_tokens": 216935680.0, + "step": 8381 + }, + { + "epoch": 0.9204919833077092, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.424077033996582, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7045345306396484, + "num_tokens": 216960966.0, + "step": 8382 + }, + { + "epoch": 0.9206018010103229, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3362324237823486, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7137064933776855, + "num_tokens": 216984655.0, + "step": 8383 + }, + { + "epoch": 0.9207116187129365, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.120612859725952, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7246537208557129, + "num_tokens": 217013859.0, + "step": 8384 + }, + { + "epoch": 0.9208214364155501, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5132827758789062, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6883566379547119, + "num_tokens": 217039191.0, + "step": 8385 + }, + { + "epoch": 0.9209312541181639, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.383608341217041, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6998028755187988, + "num_tokens": 217065249.0, + "step": 8386 + }, + { 
+ "epoch": 0.9210410718207775, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.330289602279663, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6947958469390869, + "num_tokens": 217092553.0, + "step": 8387 + }, + { + "epoch": 0.9211508895233912, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3895511627197266, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7082868814468384, + "num_tokens": 217117661.0, + "step": 8388 + }, + { + "epoch": 0.9212607072260048, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.304110527038574, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6814989447593689, + "num_tokens": 217146021.0, + "step": 8389 + }, + { + "epoch": 0.9213705249286185, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3690707683563232, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6937788724899292, + "num_tokens": 217171003.0, + "step": 8390 + }, + { + "epoch": 0.9214803426312321, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.389463424682617, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6984851956367493, + "num_tokens": 217194716.0, + "step": 8391 + }, + { + "epoch": 0.9215901603338458, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.332756280899048, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6903866529464722, + "num_tokens": 217220352.0, + "step": 8392 + }, + { + "epoch": 0.9216999780364594, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.3576390743255615, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7260565161705017, + "num_tokens": 217241491.0, + "step": 8393 + }, + { + "epoch": 0.9218097957390732, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4981508255004883, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7207503318786621, + "num_tokens": 217261562.0, + "step": 8394 + }, + { + "epoch": 0.9219196134416868, + 
"ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1041882038116455, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7069183588027954, + "num_tokens": 217294792.0, + "step": 8395 + }, + { + "epoch": 0.9220294311443005, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2479422092437744, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7090190649032593, + "num_tokens": 217322645.0, + "step": 8396 + }, + { + "epoch": 0.9221392488469141, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.020507335662842, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6994205713272095, + "num_tokens": 217357087.0, + "step": 8397 + }, + { + "epoch": 0.9222490665495278, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.5077261924743652, + "learning_rate": 1e-06, + "loss": 1.1452, + "mean_token_accuracy": 0.6703426837921143, + "num_tokens": 217383181.0, + "step": 8398 + }, + { + "epoch": 0.9223588842521414, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 2.800260543823242, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7118653655052185, + "num_tokens": 217401323.0, + "step": 8399 + }, + { + "epoch": 0.922468701954755, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1067893505096436, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.697960615158081, + "num_tokens": 217430911.0, + "step": 8400 + }, + { + "epoch": 0.9225785196573688, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2146494388580322, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6911083459854126, + "num_tokens": 217460849.0, + "step": 8401 + }, + { + "epoch": 0.9226883373599825, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4312024116516113, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7141295671463013, + "num_tokens": 217487254.0, + "step": 8402 + }, + { + "epoch": 0.9227981550625961, + "ewc_loss": 1.4781951904296875e-05, 
+ "grad_norm": 2.326075315475464, + "learning_rate": 1e-06, + "loss": 1.1158, + "mean_token_accuracy": 0.6728842258453369, + "num_tokens": 217515029.0, + "step": 8403 + }, + { + "epoch": 0.9229079727652097, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4229657649993896, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.697064995765686, + "num_tokens": 217539472.0, + "step": 8404 + }, + { + "epoch": 0.9230177904678234, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2991230487823486, + "learning_rate": 1e-06, + "loss": 1.127, + "mean_token_accuracy": 0.6948683857917786, + "num_tokens": 217565277.0, + "step": 8405 + }, + { + "epoch": 0.923127608170437, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3126838207244873, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6981013417243958, + "num_tokens": 217591244.0, + "step": 8406 + }, + { + "epoch": 0.9232374258730507, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.225789785385132, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.702820360660553, + "num_tokens": 217619176.0, + "step": 8407 + }, + { + "epoch": 0.9233472435756644, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.158604383468628, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7191804647445679, + "num_tokens": 217646909.0, + "step": 8408 + }, + { + "epoch": 0.9234570612782781, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.6066336631774902, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7065863609313965, + "num_tokens": 217667989.0, + "step": 8409 + }, + { + "epoch": 0.9235668789808917, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.271805763244629, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7203014492988586, + "num_tokens": 217694095.0, + "step": 8410 + }, + { + "epoch": 0.9236766966835054, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.249196767807007, + 
"learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6814333200454712, + "num_tokens": 217723548.0, + "step": 8411 + }, + { + "epoch": 0.923786514386119, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.509279251098633, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7040823101997375, + "num_tokens": 217747359.0, + "step": 8412 + }, + { + "epoch": 0.9238963320887327, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.421699285507202, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6978731155395508, + "num_tokens": 217772721.0, + "step": 8413 + }, + { + "epoch": 0.9240061497913463, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.360684871673584, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6880295276641846, + "num_tokens": 217797517.0, + "step": 8414 + }, + { + "epoch": 0.9241159674939601, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1920278072357178, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6887206435203552, + "num_tokens": 217826190.0, + "step": 8415 + }, + { + "epoch": 0.9242257851965737, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.440011501312256, + "learning_rate": 1e-06, + "loss": 1.0731, + "mean_token_accuracy": 0.6842796206474304, + "num_tokens": 217852227.0, + "step": 8416 + }, + { + "epoch": 0.9243356028991874, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2352352142333984, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.69915771484375, + "num_tokens": 217880720.0, + "step": 8417 + }, + { + "epoch": 0.924445420601801, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.634726047515869, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7046889066696167, + "num_tokens": 217901861.0, + "step": 8418 + }, + { + "epoch": 0.9245552383044147, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.153250217437744, + "learning_rate": 1e-06, + "loss": 1.0581, 
+ "mean_token_accuracy": 0.6903754472732544, + "num_tokens": 217931798.0, + "step": 8419 + }, + { + "epoch": 0.9246650560070283, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.284930467605591, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6884464025497437, + "num_tokens": 217959279.0, + "step": 8420 + }, + { + "epoch": 0.924774873709642, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.300037384033203, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7021238207817078, + "num_tokens": 217984120.0, + "step": 8421 + }, + { + "epoch": 0.9248846914122557, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.292466402053833, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7281112670898438, + "num_tokens": 218010085.0, + "step": 8422 + }, + { + "epoch": 0.9249945091148694, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.0620765686035156, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7005412578582764, + "num_tokens": 218042423.0, + "step": 8423 + }, + { + "epoch": 0.925104326817483, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4595189094543457, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6963883638381958, + "num_tokens": 218067588.0, + "step": 8424 + }, + { + "epoch": 0.9252141445200966, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.0566818714141846, + "learning_rate": 1e-06, + "loss": 1.1772, + "mean_token_accuracy": 0.6593383550643921, + "num_tokens": 218101759.0, + "step": 8425 + }, + { + "epoch": 0.9253239622227103, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.265516996383667, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6987146139144897, + "num_tokens": 218129303.0, + "step": 8426 + }, + { + "epoch": 0.9254337799253239, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4154443740844727, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 
0.6980298757553101, + "num_tokens": 218154835.0, + "step": 8427 + }, + { + "epoch": 0.9255435976279376, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2426376342773438, + "learning_rate": 1e-06, + "loss": 1.0823, + "mean_token_accuracy": 0.6815403699874878, + "num_tokens": 218183636.0, + "step": 8428 + }, + { + "epoch": 0.9256534153305512, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5144262313842773, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.709443211555481, + "num_tokens": 218207019.0, + "step": 8429 + }, + { + "epoch": 0.925763233033165, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.497448444366455, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7145816087722778, + "num_tokens": 218229583.0, + "step": 8430 + }, + { + "epoch": 0.9258730507357786, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3278722763061523, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6997765898704529, + "num_tokens": 218256194.0, + "step": 8431 + }, + { + "epoch": 0.9259828684383923, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5119712352752686, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6824796199798584, + "num_tokens": 218280046.0, + "step": 8432 + }, + { + "epoch": 0.9260926861410059, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.866495132446289, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7166714668273926, + "num_tokens": 218299251.0, + "step": 8433 + }, + { + "epoch": 0.9262025038436196, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4147186279296875, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.710247278213501, + "num_tokens": 218323170.0, + "step": 8434 + }, + { + "epoch": 0.9263123215462332, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5074455738067627, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.708672046661377, + "num_tokens": 
218346012.0, + "step": 8435 + }, + { + "epoch": 0.9264221392488469, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.561027765274048, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7171235084533691, + "num_tokens": 218368155.0, + "step": 8436 + }, + { + "epoch": 0.9265319569514606, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.228501081466675, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6971089243888855, + "num_tokens": 218394595.0, + "step": 8437 + }, + { + "epoch": 0.9266417746540743, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.428781270980835, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7023855447769165, + "num_tokens": 218419929.0, + "step": 8438 + }, + { + "epoch": 0.9267515923566879, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.47294020652771, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7330517172813416, + "num_tokens": 218441566.0, + "step": 8439 + }, + { + "epoch": 0.9268614100593016, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5355160236358643, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7150404453277588, + "num_tokens": 218464451.0, + "step": 8440 + }, + { + "epoch": 0.9269712277619152, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2920730113983154, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6983349323272705, + "num_tokens": 218492548.0, + "step": 8441 + }, + { + "epoch": 0.9270810454645289, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5769944190979004, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7065838575363159, + "num_tokens": 218519220.0, + "step": 8442 + }, + { + "epoch": 0.9271908631671425, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5807650089263916, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6967633962631226, + "num_tokens": 218541685.0, + "step": 8443 + }, + { + 
"epoch": 0.9273006808697563, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4314842224121094, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.72220778465271, + "num_tokens": 218565855.0, + "step": 8444 + }, + { + "epoch": 0.9274104985723699, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.301727771759033, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6887269020080566, + "num_tokens": 218592633.0, + "step": 8445 + }, + { + "epoch": 0.9275203162749835, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4092607498168945, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.704949140548706, + "num_tokens": 218618417.0, + "step": 8446 + }, + { + "epoch": 0.9276301339775972, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4175851345062256, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7188653349876404, + "num_tokens": 218641999.0, + "step": 8447 + }, + { + "epoch": 0.9277399516802108, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2236602306365967, + "learning_rate": 1e-06, + "loss": 1.0982, + "mean_token_accuracy": 0.6809601187705994, + "num_tokens": 218670529.0, + "step": 8448 + }, + { + "epoch": 0.9278497693828245, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5126090049743652, + "learning_rate": 1e-06, + "loss": 1.118, + "mean_token_accuracy": 0.681526780128479, + "num_tokens": 218695484.0, + "step": 8449 + }, + { + "epoch": 0.9279595870854381, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1272289752960205, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.7023438215255737, + "num_tokens": 218727725.0, + "step": 8450 + }, + { + "epoch": 0.9280694047880519, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.201035499572754, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.716907799243927, + "num_tokens": 218754505.0, + "step": 8451 + }, + { + "epoch": 0.9281792224906655, + 
"ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.351787567138672, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7063072323799133, + "num_tokens": 218778201.0, + "step": 8452 + }, + { + "epoch": 0.9282890401932792, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4756932258605957, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7145928144454956, + "num_tokens": 218801071.0, + "step": 8453 + }, + { + "epoch": 0.9283988578958928, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5159730911254883, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7000641822814941, + "num_tokens": 218821290.0, + "step": 8454 + }, + { + "epoch": 0.9285086755985065, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.250697374343872, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7180392146110535, + "num_tokens": 218847790.0, + "step": 8455 + }, + { + "epoch": 0.9286184933011201, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.348571538925171, + "learning_rate": 1e-06, + "loss": 1.0816, + "mean_token_accuracy": 0.6789778470993042, + "num_tokens": 218872798.0, + "step": 8456 + }, + { + "epoch": 0.9287283110037338, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4724984169006348, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6943807005882263, + "num_tokens": 218896411.0, + "step": 8457 + }, + { + "epoch": 0.9288381287063474, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.448017120361328, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6979958415031433, + "num_tokens": 218919199.0, + "step": 8458 + }, + { + "epoch": 0.9289479464089612, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.303208351135254, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7026690244674683, + "num_tokens": 218947282.0, + "step": 8459 + }, + { + "epoch": 0.9290577641115748, + "ewc_loss": 1.4781951904296875e-05, + 
"grad_norm": 2.4103286266326904, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7029872536659241, + "num_tokens": 218970788.0, + "step": 8460 + }, + { + "epoch": 0.9291675818141885, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1696712970733643, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7152895927429199, + "num_tokens": 219001070.0, + "step": 8461 + }, + { + "epoch": 0.9292773995168021, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3862509727478027, + "learning_rate": 1e-06, + "loss": 1.0917, + "mean_token_accuracy": 0.6745715141296387, + "num_tokens": 219029358.0, + "step": 8462 + }, + { + "epoch": 0.9293872172194158, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3566699028015137, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7262516617774963, + "num_tokens": 219055155.0, + "step": 8463 + }, + { + "epoch": 0.9294970349220294, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.0496554374694824, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.6855154037475586, + "num_tokens": 219088784.0, + "step": 8464 + }, + { + "epoch": 0.929606852624643, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1481988430023193, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7178093791007996, + "num_tokens": 219119790.0, + "step": 8465 + }, + { + "epoch": 0.9297166703272568, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.385577440261841, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7087530493736267, + "num_tokens": 219142801.0, + "step": 8466 + }, + { + "epoch": 0.9298264880298704, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.472564458847046, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7269630432128906, + "num_tokens": 219168134.0, + "step": 8467 + }, + { + "epoch": 0.9299363057324841, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.454015016555786, + 
"learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7286680936813354, + "num_tokens": 219191748.0, + "step": 8468 + }, + { + "epoch": 0.9300461234350977, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.704324960708618, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.695510745048523, + "num_tokens": 219214834.0, + "step": 8469 + }, + { + "epoch": 0.9301559411377114, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3210620880126953, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7108564972877502, + "num_tokens": 219243265.0, + "step": 8470 + }, + { + "epoch": 0.930265758840325, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.0187430381774902, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.714470386505127, + "num_tokens": 219275706.0, + "step": 8471 + }, + { + "epoch": 0.9303755765429387, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.258517026901245, + "learning_rate": 1e-06, + "loss": 1.108, + "mean_token_accuracy": 0.6696317791938782, + "num_tokens": 219305391.0, + "step": 8472 + }, + { + "epoch": 0.9304853942455524, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4135518074035645, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6932788491249084, + "num_tokens": 219331399.0, + "step": 8473 + }, + { + "epoch": 0.9305952119481661, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2069098949432373, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7260125875473022, + "num_tokens": 219358967.0, + "step": 8474 + }, + { + "epoch": 0.9307050296507797, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.0145606994628906, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.698630154132843, + "num_tokens": 219394839.0, + "step": 8475 + }, + { + "epoch": 0.9308148473533934, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.6934916973114014, + "learning_rate": 1e-06, + "loss": 
0.9596, + "mean_token_accuracy": 0.7155306339263916, + "num_tokens": 219413954.0, + "step": 8476 + }, + { + "epoch": 0.930924665056007, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.243499755859375, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7025481462478638, + "num_tokens": 219440917.0, + "step": 8477 + }, + { + "epoch": 0.9310344827586207, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2364041805267334, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.684134840965271, + "num_tokens": 219469897.0, + "step": 8478 + }, + { + "epoch": 0.9311443004612343, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4599642753601074, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.709189772605896, + "num_tokens": 219491251.0, + "step": 8479 + }, + { + "epoch": 0.9312541181638481, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.51267147064209, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7043257355690002, + "num_tokens": 219512466.0, + "step": 8480 + }, + { + "epoch": 0.9313639358664617, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2710115909576416, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7044787406921387, + "num_tokens": 219539230.0, + "step": 8481 + }, + { + "epoch": 0.9314737535690754, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2262022495269775, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7109681367874146, + "num_tokens": 219567470.0, + "step": 8482 + }, + { + "epoch": 0.931583571271689, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.199922800064087, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.709750771522522, + "num_tokens": 219595741.0, + "step": 8483 + }, + { + "epoch": 0.9316933889743026, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3867366313934326, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 
0.7220356464385986, + "num_tokens": 219620741.0, + "step": 8484 + }, + { + "epoch": 0.9318032066769163, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3122777938842773, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7001888751983643, + "num_tokens": 219648414.0, + "step": 8485 + }, + { + "epoch": 0.9319130243795299, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2061195373535156, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6932799816131592, + "num_tokens": 219675933.0, + "step": 8486 + }, + { + "epoch": 0.9320228420821436, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3528642654418945, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6952846050262451, + "num_tokens": 219702289.0, + "step": 8487 + }, + { + "epoch": 0.9321326597847573, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4589662551879883, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.695449709892273, + "num_tokens": 219724969.0, + "step": 8488 + }, + { + "epoch": 0.932242477487371, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.6489977836608887, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7234378457069397, + "num_tokens": 219745247.0, + "step": 8489 + }, + { + "epoch": 0.9323522951899846, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.6514456272125244, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7170358896255493, + "num_tokens": 219767121.0, + "step": 8490 + }, + { + "epoch": 0.9324621128925983, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.290567636489868, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.704261302947998, + "num_tokens": 219793110.0, + "step": 8491 + }, + { + "epoch": 0.9325719305952119, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.352513313293457, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7153077125549316, + "num_tokens": 
219817431.0, + "step": 8492 + }, + { + "epoch": 0.9326817482978256, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4210803508758545, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7099663019180298, + "num_tokens": 219840958.0, + "step": 8493 + }, + { + "epoch": 0.9327915660004392, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5802414417266846, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7004122734069824, + "num_tokens": 219864220.0, + "step": 8494 + }, + { + "epoch": 0.932901383703053, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5645737648010254, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7227654457092285, + "num_tokens": 219885350.0, + "step": 8495 + }, + { + "epoch": 0.9330112014056666, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.324667453765869, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.7016451954841614, + "num_tokens": 219910183.0, + "step": 8496 + }, + { + "epoch": 0.9331210191082803, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4980835914611816, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.709716796875, + "num_tokens": 219935143.0, + "step": 8497 + }, + { + "epoch": 0.9332308368108939, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.395886182785034, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6841069459915161, + "num_tokens": 219962271.0, + "step": 8498 + }, + { + "epoch": 0.9333406545135076, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.0740163326263428, + "learning_rate": 1e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.6824278831481934, + "num_tokens": 219996977.0, + "step": 8499 + }, + { + "epoch": 0.9334504722161212, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.6340763568878174, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7067337036132812, + "num_tokens": 220019017.0, + "step": 8500 + }, + { + 
"epoch": 0.9335602899187349, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.239917278289795, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7179349660873413, + "num_tokens": 220044241.0, + "step": 8501 + }, + { + "epoch": 0.9336701076213486, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.566636085510254, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6948279142379761, + "num_tokens": 220066675.0, + "step": 8502 + }, + { + "epoch": 0.9337799253239623, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5645406246185303, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.720115602016449, + "num_tokens": 220087579.0, + "step": 8503 + }, + { + "epoch": 0.9338897430265759, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.907750368118286, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.70882648229599, + "num_tokens": 220105762.0, + "step": 8504 + }, + { + "epoch": 0.9339995607291895, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 4.225744247436523, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7080705761909485, + "num_tokens": 220127377.0, + "step": 8505 + }, + { + "epoch": 0.9341093784318032, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2734107971191406, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7164249420166016, + "num_tokens": 220156035.0, + "step": 8506 + }, + { + "epoch": 0.9342191961344168, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2039430141448975, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7075004577636719, + "num_tokens": 220186364.0, + "step": 8507 + }, + { + "epoch": 0.9343290138370305, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2109999656677246, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6875125765800476, + "num_tokens": 220216923.0, + "step": 8508 + }, + { + "epoch": 0.9344388315396442, + 
"ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2652556896209717, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.682150661945343, + "num_tokens": 220243427.0, + "step": 8509 + }, + { + "epoch": 0.9345486492422579, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2742018699645996, + "learning_rate": 1e-06, + "loss": 1.1024, + "mean_token_accuracy": 0.6785248517990112, + "num_tokens": 220270601.0, + "step": 8510 + }, + { + "epoch": 0.9346584669448715, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2639808654785156, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6990113258361816, + "num_tokens": 220299537.0, + "step": 8511 + }, + { + "epoch": 0.9347682846474852, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 3.076650381088257, + "learning_rate": 1e-06, + "loss": 1.1085, + "mean_token_accuracy": 0.6757831573486328, + "num_tokens": 220325252.0, + "step": 8512 + }, + { + "epoch": 0.9348781023500988, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2020487785339355, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7091661095619202, + "num_tokens": 220351982.0, + "step": 8513 + }, + { + "epoch": 0.9349879200527125, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.171987295150757, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.702497661113739, + "num_tokens": 220381152.0, + "step": 8514 + }, + { + "epoch": 0.9350977377553261, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.7079904079437256, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7278337478637695, + "num_tokens": 220399949.0, + "step": 8515 + }, + { + "epoch": 0.9352075554579398, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.040633201599121, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7081041932106018, + "num_tokens": 220432231.0, + "step": 8516 + }, + { + "epoch": 0.9353173731605535, + "ewc_loss": 1.4841556549072266e-05, + 
"grad_norm": 2.4563257694244385, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7150205969810486, + "num_tokens": 220455363.0, + "step": 8517 + }, + { + "epoch": 0.9354271908631672, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.171544313430786, + "learning_rate": 1e-06, + "loss": 1.0914, + "mean_token_accuracy": 0.6944777965545654, + "num_tokens": 220485388.0, + "step": 8518 + }, + { + "epoch": 0.9355370085657808, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.306745767593384, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6947172284126282, + "num_tokens": 220510686.0, + "step": 8519 + }, + { + "epoch": 0.9356468262683945, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.263280153274536, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6969342231750488, + "num_tokens": 220538470.0, + "step": 8520 + }, + { + "epoch": 0.9357566439710081, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.466798782348633, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7257128357887268, + "num_tokens": 220559701.0, + "step": 8521 + }, + { + "epoch": 0.9358664616736218, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.2757272720336914, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7231410145759583, + "num_tokens": 220587287.0, + "step": 8522 + }, + { + "epoch": 0.9359762793762354, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.337317943572998, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.7007262110710144, + "num_tokens": 220612885.0, + "step": 8523 + }, + { + "epoch": 0.9360860970788492, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1716060638427734, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6812108159065247, + "num_tokens": 220642379.0, + "step": 8524 + }, + { + "epoch": 0.9361959147814628, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1964919567108154, + 
"learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6963595151901245, + "num_tokens": 220670345.0, + "step": 8525 + }, + { + "epoch": 0.9363057324840764, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4094860553741455, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6834794878959656, + "num_tokens": 220695748.0, + "step": 8526 + }, + { + "epoch": 0.9364155501866901, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.1491777896881104, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6944953203201294, + "num_tokens": 220725066.0, + "step": 8527 + }, + { + "epoch": 0.9365253678893037, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.3835582733154297, + "learning_rate": 1e-06, + "loss": 1.0759, + "mean_token_accuracy": 0.6887015700340271, + "num_tokens": 220750281.0, + "step": 8528 + }, + { + "epoch": 0.9366351855919174, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.312784433364868, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7042759656906128, + "num_tokens": 220778300.0, + "step": 8529 + }, + { + "epoch": 0.936745003294531, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.346088171005249, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.7046216130256653, + "num_tokens": 220803943.0, + "step": 8530 + }, + { + "epoch": 0.9368548209971448, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.283601760864258, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7200761437416077, + "num_tokens": 220827576.0, + "step": 8531 + }, + { + "epoch": 0.9369646386997584, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.6566708087921143, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6917707920074463, + "num_tokens": 220849305.0, + "step": 8532 + }, + { + "epoch": 0.9370744564023721, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4258389472961426, + "learning_rate": 1e-06, + "loss": 
1.0539, + "mean_token_accuracy": 0.6879811882972717, + "num_tokens": 220875563.0, + "step": 8533 + }, + { + "epoch": 0.9371842741049857, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.4370949268341064, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7071762084960938, + "num_tokens": 220898919.0, + "step": 8534 + }, + { + "epoch": 0.9372940918075994, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.30938720703125, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6923074722290039, + "num_tokens": 220925388.0, + "step": 8535 + }, + { + "epoch": 0.937403909510213, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5949745178222656, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7037159204483032, + "num_tokens": 220946587.0, + "step": 8536 + }, + { + "epoch": 0.9375137272128267, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.39336895942688, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6955736875534058, + "num_tokens": 220970097.0, + "step": 8537 + }, + { + "epoch": 0.9376235449154404, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2378346920013428, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6910548210144043, + "num_tokens": 220999343.0, + "step": 8538 + }, + { + "epoch": 0.9377333626180541, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.289625406265259, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7025799751281738, + "num_tokens": 221027252.0, + "step": 8539 + }, + { + "epoch": 0.9378431803206677, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.5227420330047607, + "learning_rate": 1e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6984801292419434, + "num_tokens": 221052625.0, + "step": 8540 + }, + { + "epoch": 0.9379529980232814, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.466907262802124, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 
0.7032291889190674, + "num_tokens": 221076412.0, + "step": 8541 + }, + { + "epoch": 0.938062815725895, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3015239238739014, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7207533121109009, + "num_tokens": 221104149.0, + "step": 8542 + }, + { + "epoch": 0.9381726334285087, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1504640579223633, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6980457901954651, + "num_tokens": 221132434.0, + "step": 8543 + }, + { + "epoch": 0.9382824511311223, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.302922248840332, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7032264471054077, + "num_tokens": 221157135.0, + "step": 8544 + }, + { + "epoch": 0.9383922688337359, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.585313320159912, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7198343276977539, + "num_tokens": 221177932.0, + "step": 8545 + }, + { + "epoch": 0.9385020865363497, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 2.531346082687378, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7078084945678711, + "num_tokens": 221200677.0, + "step": 8546 + }, + { + "epoch": 0.9386119042389633, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2556917667388916, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7094488143920898, + "num_tokens": 221226510.0, + "step": 8547 + }, + { + "epoch": 0.938721721941577, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.289916753768921, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6801724433898926, + "num_tokens": 221254321.0, + "step": 8548 + }, + { + "epoch": 0.9388315396441906, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4784128665924072, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7018561363220215, + "num_tokens": 
221277301.0, + "step": 8549 + }, + { + "epoch": 0.9389413573468043, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5677573680877686, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7242861390113831, + "num_tokens": 221298457.0, + "step": 8550 + }, + { + "epoch": 0.9390511750494179, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3419957160949707, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7076458930969238, + "num_tokens": 221323999.0, + "step": 8551 + }, + { + "epoch": 0.9391609927520316, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2327218055725098, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7172389030456543, + "num_tokens": 221350861.0, + "step": 8552 + }, + { + "epoch": 0.9392708104546453, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.328676223754883, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6949027180671692, + "num_tokens": 221379066.0, + "step": 8553 + }, + { + "epoch": 0.939380628157259, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3976542949676514, + "learning_rate": 1e-06, + "loss": 1.1332, + "mean_token_accuracy": 0.6726565957069397, + "num_tokens": 221405645.0, + "step": 8554 + }, + { + "epoch": 0.9394904458598726, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1216044425964355, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7033697962760925, + "num_tokens": 221434456.0, + "step": 8555 + }, + { + "epoch": 0.9396002635624863, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.464951276779175, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7215819954872131, + "num_tokens": 221455517.0, + "step": 8556 + }, + { + "epoch": 0.9397100812650999, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.651323080062866, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7080654501914978, + "num_tokens": 221475373.0, + "step": 8557 + }, + { 
+ "epoch": 0.9398198989677136, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.336625814437866, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.708379864692688, + "num_tokens": 221503028.0, + "step": 8558 + }, + { + "epoch": 0.9399297166703272, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4963538646698, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7129452228546143, + "num_tokens": 221526144.0, + "step": 8559 + }, + { + "epoch": 0.940039534372941, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.60029673576355, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6921225190162659, + "num_tokens": 221547183.0, + "step": 8560 + }, + { + "epoch": 0.9401493520755546, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4189939498901367, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7469466924667358, + "num_tokens": 221569164.0, + "step": 8561 + }, + { + "epoch": 0.9402591697781683, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2231156826019287, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7031646966934204, + "num_tokens": 221597868.0, + "step": 8562 + }, + { + "epoch": 0.9403689874807819, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.411219596862793, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7007223963737488, + "num_tokens": 221624517.0, + "step": 8563 + }, + { + "epoch": 0.9404788051833955, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1472785472869873, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6990463733673096, + "num_tokens": 221654198.0, + "step": 8564 + }, + { + "epoch": 0.9405886228860092, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.62391996383667, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7098641395568848, + "num_tokens": 221676354.0, + "step": 8565 + }, + { + "epoch": 0.9406984405886228, + 
"ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.065627336502075, + "learning_rate": 1e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.6769033670425415, + "num_tokens": 221711475.0, + "step": 8566 + }, + { + "epoch": 0.9408082582912366, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.304290771484375, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7160526514053345, + "num_tokens": 221738072.0, + "step": 8567 + }, + { + "epoch": 0.9409180759938502, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.241356611251831, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6901962757110596, + "num_tokens": 221765201.0, + "step": 8568 + }, + { + "epoch": 0.9410278936964639, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3371217250823975, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7024155855178833, + "num_tokens": 221789983.0, + "step": 8569 + }, + { + "epoch": 0.9411377113990775, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2109673023223877, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7077769637107849, + "num_tokens": 221818675.0, + "step": 8570 + }, + { + "epoch": 0.9412475291016912, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1564412117004395, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6930088400840759, + "num_tokens": 221849966.0, + "step": 8571 + }, + { + "epoch": 0.9413573468043048, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2711288928985596, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.7039370536804199, + "num_tokens": 221877967.0, + "step": 8572 + }, + { + "epoch": 0.9414671645069185, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.368384599685669, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7281403541564941, + "num_tokens": 221901832.0, + "step": 8573 + }, + { + "epoch": 0.9415769822095322, + "ewc_loss": 1.4841556549072266e-05, 
+ "grad_norm": 2.463275671005249, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7046325206756592, + "num_tokens": 221926364.0, + "step": 8574 + }, + { + "epoch": 0.9416867999121459, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.393223285675049, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7197502851486206, + "num_tokens": 221949997.0, + "step": 8575 + }, + { + "epoch": 0.9417966176147595, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.8215651512145996, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.711322009563446, + "num_tokens": 221972166.0, + "step": 8576 + }, + { + "epoch": 0.9419064353173732, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.213332414627075, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.7003943920135498, + "num_tokens": 222000577.0, + "step": 8577 + }, + { + "epoch": 0.9420162530199868, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2795939445495605, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7110369205474854, + "num_tokens": 222028566.0, + "step": 8578 + }, + { + "epoch": 0.9421260707226005, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.398115396499634, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7240320444107056, + "num_tokens": 222052475.0, + "step": 8579 + }, + { + "epoch": 0.9422358884252141, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.167947292327881, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6953022480010986, + "num_tokens": 222080988.0, + "step": 8580 + }, + { + "epoch": 0.9423457061278278, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 3.7788147926330566, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.7009053230285645, + "num_tokens": 222108697.0, + "step": 8581 + }, + { + "epoch": 0.9424555238304415, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.791724920272827, + 
"learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7190617322921753, + "num_tokens": 222126981.0, + "step": 8582 + }, + { + "epoch": 0.9425653415330552, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.242285966873169, + "learning_rate": 1e-06, + "loss": 1.1554, + "mean_token_accuracy": 0.6676938533782959, + "num_tokens": 222155767.0, + "step": 8583 + }, + { + "epoch": 0.9426751592356688, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.7252423763275146, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6963138580322266, + "num_tokens": 222177533.0, + "step": 8584 + }, + { + "epoch": 0.9427849769382824, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3199827671051025, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7129478454589844, + "num_tokens": 222201844.0, + "step": 8585 + }, + { + "epoch": 0.9428947946408961, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4790544509887695, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.71238774061203, + "num_tokens": 222224495.0, + "step": 8586 + }, + { + "epoch": 0.9430046123435097, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.8434970378875732, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.700585126876831, + "num_tokens": 222243560.0, + "step": 8587 + }, + { + "epoch": 0.9431144300461234, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1587235927581787, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7186096906661987, + "num_tokens": 222274129.0, + "step": 8588 + }, + { + "epoch": 0.9432242477487371, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1627063751220703, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7176763415336609, + "num_tokens": 222305237.0, + "step": 8589 + }, + { + "epoch": 0.9433340654513508, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.598832607269287, + "learning_rate": 1e-06, + "loss": 
1.0006, + "mean_token_accuracy": 0.7005088925361633, + "num_tokens": 222328774.0, + "step": 8590 + }, + { + "epoch": 0.9434438831539644, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.315725803375244, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7221812605857849, + "num_tokens": 222352974.0, + "step": 8591 + }, + { + "epoch": 0.9435537008565781, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4604713916778564, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7000339031219482, + "num_tokens": 222375481.0, + "step": 8592 + }, + { + "epoch": 0.9436635185591917, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4171371459960938, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7009391188621521, + "num_tokens": 222399146.0, + "step": 8593 + }, + { + "epoch": 0.9437733362618054, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.0350430011749268, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6918622255325317, + "num_tokens": 222432584.0, + "step": 8594 + }, + { + "epoch": 0.943883153964419, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 3.962087631225586, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7027517557144165, + "num_tokens": 222454366.0, + "step": 8595 + }, + { + "epoch": 0.9439929716670328, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.400590419769287, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7042098641395569, + "num_tokens": 222479174.0, + "step": 8596 + }, + { + "epoch": 0.9441027893696464, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3309271335601807, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7214863300323486, + "num_tokens": 222503908.0, + "step": 8597 + }, + { + "epoch": 0.9442126070722601, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.7644450664520264, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 
0.7385320067405701, + "num_tokens": 222520848.0, + "step": 8598 + }, + { + "epoch": 0.9443224247748737, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.659900426864624, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.725760281085968, + "num_tokens": 222538364.0, + "step": 8599 + }, + { + "epoch": 0.9444322424774874, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3471691608428955, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6903510093688965, + "num_tokens": 222565875.0, + "step": 8600 + }, + { + "epoch": 0.944542060180101, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.466601610183716, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.679813027381897, + "num_tokens": 222591050.0, + "step": 8601 + }, + { + "epoch": 0.9446518778827147, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3521201610565186, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7124128341674805, + "num_tokens": 222616289.0, + "step": 8602 + }, + { + "epoch": 0.9447616955853284, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1939468383789062, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.7014331817626953, + "num_tokens": 222642934.0, + "step": 8603 + }, + { + "epoch": 0.944871513287942, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3553619384765625, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7299161553382874, + "num_tokens": 222669971.0, + "step": 8604 + }, + { + "epoch": 0.9449813309905557, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4610002040863037, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7134349346160889, + "num_tokens": 222693886.0, + "step": 8605 + }, + { + "epoch": 0.9450911486931693, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4312453269958496, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.699864387512207, + "num_tokens": 
222718751.0, + "step": 8606 + }, + { + "epoch": 0.945200966395783, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.6368799209594727, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7239311933517456, + "num_tokens": 222739723.0, + "step": 8607 + }, + { + "epoch": 0.9453107840983966, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4527275562286377, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7131792902946472, + "num_tokens": 222764431.0, + "step": 8608 + }, + { + "epoch": 0.9454206018010103, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.610887289047241, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7171038389205933, + "num_tokens": 222786067.0, + "step": 8609 + }, + { + "epoch": 0.9455304195036239, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.353778600692749, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6852163672447205, + "num_tokens": 222811849.0, + "step": 8610 + }, + { + "epoch": 0.9456402372062377, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5234715938568115, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6866151094436646, + "num_tokens": 222837098.0, + "step": 8611 + }, + { + "epoch": 0.9457500549088513, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.31453800201416, + "learning_rate": 1e-06, + "loss": 1.1044, + "mean_token_accuracy": 0.687263548374176, + "num_tokens": 222867209.0, + "step": 8612 + }, + { + "epoch": 0.945859872611465, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.45683217048645, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6809265613555908, + "num_tokens": 222891024.0, + "step": 8613 + }, + { + "epoch": 0.9459696903140786, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3558874130249023, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7133810520172119, + "num_tokens": 222914495.0, + "step": 8614 + }, + { + 
"epoch": 0.9460795080166923, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.660897970199585, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7169032096862793, + "num_tokens": 222933415.0, + "step": 8615 + }, + { + "epoch": 0.9461893257193059, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.530867338180542, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.7054752111434937, + "num_tokens": 222956258.0, + "step": 8616 + }, + { + "epoch": 0.9462991434219196, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.624227523803711, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7152541279792786, + "num_tokens": 222976109.0, + "step": 8617 + }, + { + "epoch": 0.9464089611245333, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.376910924911499, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6999930143356323, + "num_tokens": 223002419.0, + "step": 8618 + }, + { + "epoch": 0.946518778827147, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2669451236724854, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6983089447021484, + "num_tokens": 223029426.0, + "step": 8619 + }, + { + "epoch": 0.9466285965297606, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.281752109527588, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7215473055839539, + "num_tokens": 223055601.0, + "step": 8620 + }, + { + "epoch": 0.9467384142323743, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.18687105178833, + "learning_rate": 1e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.6857010722160339, + "num_tokens": 223086401.0, + "step": 8621 + }, + { + "epoch": 0.9468482319349879, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.374863386154175, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7027579545974731, + "num_tokens": 223112709.0, + "step": 8622 + }, + { + "epoch": 0.9469580496376016, + 
"ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5575180053710938, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7101901769638062, + "num_tokens": 223132726.0, + "step": 8623 + }, + { + "epoch": 0.9470678673402152, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.771944046020508, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7234770059585571, + "num_tokens": 223151442.0, + "step": 8624 + }, + { + "epoch": 0.947177685042829, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.469914197921753, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7015752792358398, + "num_tokens": 223174241.0, + "step": 8625 + }, + { + "epoch": 0.9472875027454426, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4942407608032227, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7180328369140625, + "num_tokens": 223196993.0, + "step": 8626 + }, + { + "epoch": 0.9473973204480562, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1188154220581055, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6972413063049316, + "num_tokens": 223226771.0, + "step": 8627 + }, + { + "epoch": 0.9475071381506699, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.6150033473968506, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7110577821731567, + "num_tokens": 223249487.0, + "step": 8628 + }, + { + "epoch": 0.9476169558532835, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.34527325630188, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6956608891487122, + "num_tokens": 223275629.0, + "step": 8629 + }, + { + "epoch": 0.9477267735558972, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.270709991455078, + "learning_rate": 1e-06, + "loss": 1.0864, + "mean_token_accuracy": 0.6868196725845337, + "num_tokens": 223304025.0, + "step": 8630 + }, + { + "epoch": 0.9478365912585108, + "ewc_loss": 1.4901161193847656e-05, + 
"grad_norm": 2.33970046043396, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.701369047164917, + "num_tokens": 223333077.0, + "step": 8631 + }, + { + "epoch": 0.9479464089611246, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5064024925231934, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6970832347869873, + "num_tokens": 223356647.0, + "step": 8632 + }, + { + "epoch": 0.9480562266637382, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.8438079357147217, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7246870994567871, + "num_tokens": 223374548.0, + "step": 8633 + }, + { + "epoch": 0.9481660443663519, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3601925373077393, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7167537808418274, + "num_tokens": 223400800.0, + "step": 8634 + }, + { + "epoch": 0.9482758620689655, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.847996234893799, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7238730788230896, + "num_tokens": 223419768.0, + "step": 8635 + }, + { + "epoch": 0.9483856797715792, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2497222423553467, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7135114669799805, + "num_tokens": 223446443.0, + "step": 8636 + }, + { + "epoch": 0.9484954974741928, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2311508655548096, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7158972024917603, + "num_tokens": 223473510.0, + "step": 8637 + }, + { + "epoch": 0.9486053151768065, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3465628623962402, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7341195344924927, + "num_tokens": 223503095.0, + "step": 8638 + }, + { + "epoch": 0.9487151328794201, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.281198740005493, + 
"learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6947723031044006, + "num_tokens": 223532652.0, + "step": 8639 + }, + { + "epoch": 0.9488249505820339, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2617180347442627, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7125678062438965, + "num_tokens": 223560980.0, + "step": 8640 + }, + { + "epoch": 0.9489347682846475, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.282400369644165, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7250235080718994, + "num_tokens": 223586348.0, + "step": 8641 + }, + { + "epoch": 0.9490445859872612, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.6534199714660645, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7081870436668396, + "num_tokens": 223607708.0, + "step": 8642 + }, + { + "epoch": 0.9491544036898748, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.620095729827881, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7289379835128784, + "num_tokens": 223628312.0, + "step": 8643 + }, + { + "epoch": 0.9492642213924884, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.256887197494507, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.7045786380767822, + "num_tokens": 223656073.0, + "step": 8644 + }, + { + "epoch": 0.9493740390951021, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.343125104904175, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7086049318313599, + "num_tokens": 223681909.0, + "step": 8645 + }, + { + "epoch": 0.9494838567977157, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.4637176990509033, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6905504465103149, + "num_tokens": 223706150.0, + "step": 8646 + }, + { + "epoch": 0.9495936745003295, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4245166778564453, + "learning_rate": 1e-06, + "loss": 
1.0083, + "mean_token_accuracy": 0.7022643089294434, + "num_tokens": 223730524.0, + "step": 8647 + }, + { + "epoch": 0.9497034922029431, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.819038152694702, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7339129447937012, + "num_tokens": 223748705.0, + "step": 8648 + }, + { + "epoch": 0.9498133099055568, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1297361850738525, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6990975737571716, + "num_tokens": 223784072.0, + "step": 8649 + }, + { + "epoch": 0.9499231276081704, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2739295959472656, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7342665195465088, + "num_tokens": 223811086.0, + "step": 8650 + }, + { + "epoch": 0.9500329453107841, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.485442638397217, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.7138000726699829, + "num_tokens": 223835135.0, + "step": 8651 + }, + { + "epoch": 0.9501427630133977, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.39886474609375, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6876599192619324, + "num_tokens": 223861787.0, + "step": 8652 + }, + { + "epoch": 0.9502525807160114, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1822290420532227, + "learning_rate": 1e-06, + "loss": 1.0712, + "mean_token_accuracy": 0.6846257448196411, + "num_tokens": 223891162.0, + "step": 8653 + }, + { + "epoch": 0.9503623984186251, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.513201951980591, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7169442772865295, + "num_tokens": 223912433.0, + "step": 8654 + }, + { + "epoch": 0.9504722161212388, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.493375778198242, + "learning_rate": 1e-06, + "loss": 1.0672, + "mean_token_accuracy": 
0.6843411922454834, + "num_tokens": 223938108.0, + "step": 8655 + }, + { + "epoch": 0.9505820338238524, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2595374584198, + "learning_rate": 1e-06, + "loss": 1.0936, + "mean_token_accuracy": 0.6834593415260315, + "num_tokens": 223965637.0, + "step": 8656 + }, + { + "epoch": 0.9506918515264661, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.760467767715454, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7095980644226074, + "num_tokens": 223985296.0, + "step": 8657 + }, + { + "epoch": 0.9508016692290797, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2510364055633545, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6956198811531067, + "num_tokens": 224013684.0, + "step": 8658 + }, + { + "epoch": 0.9509114869316934, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.224935531616211, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6998728513717651, + "num_tokens": 224041925.0, + "step": 8659 + }, + { + "epoch": 0.951021304634307, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.542388916015625, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.6996785998344421, + "num_tokens": 224062807.0, + "step": 8660 + }, + { + "epoch": 0.9511311223369208, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.248215675354004, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.702612042427063, + "num_tokens": 224090444.0, + "step": 8661 + }, + { + "epoch": 0.9512409400395344, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.6334378719329834, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.690769374370575, + "num_tokens": 224115116.0, + "step": 8662 + }, + { + "epoch": 0.9513507577421481, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2296862602233887, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6877816319465637, + "num_tokens": 
224144631.0, + "step": 8663 + }, + { + "epoch": 0.9514605754447617, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3255207538604736, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.70252525806427, + "num_tokens": 224173130.0, + "step": 8664 + }, + { + "epoch": 0.9515703931473753, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5329935550689697, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7251909971237183, + "num_tokens": 224196040.0, + "step": 8665 + }, + { + "epoch": 0.951680210849989, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2661375999450684, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7338570952415466, + "num_tokens": 224221596.0, + "step": 8666 + }, + { + "epoch": 0.9517900285526026, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.6629457473754883, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7130470275878906, + "num_tokens": 224247333.0, + "step": 8667 + }, + { + "epoch": 0.9518998462552163, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3270022869110107, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7160903811454773, + "num_tokens": 224274178.0, + "step": 8668 + }, + { + "epoch": 0.95200966395783, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.167262077331543, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6939924955368042, + "num_tokens": 224304474.0, + "step": 8669 + }, + { + "epoch": 0.9521194816604437, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.448521852493286, + "learning_rate": 1e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.6839423179626465, + "num_tokens": 224329214.0, + "step": 8670 + }, + { + "epoch": 0.9522292993630573, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3571455478668213, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.7034904360771179, + "num_tokens": 224353752.0, + "step": 8671 + }, + { + 
"epoch": 0.952339117065671, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.866299867630005, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7323224544525146, + "num_tokens": 224369261.0, + "step": 8672 + }, + { + "epoch": 0.9524489347682846, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3062872886657715, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.703960657119751, + "num_tokens": 224397273.0, + "step": 8673 + }, + { + "epoch": 0.9525587524708983, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.509288787841797, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7071254849433899, + "num_tokens": 224419111.0, + "step": 8674 + }, + { + "epoch": 0.9526685701735119, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3231794834136963, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7100753784179688, + "num_tokens": 224445100.0, + "step": 8675 + }, + { + "epoch": 0.9527783878761257, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.369920015335083, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7237785458564758, + "num_tokens": 224469047.0, + "step": 8676 + }, + { + "epoch": 0.9528882055787393, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.479574680328369, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7158522605895996, + "num_tokens": 224491089.0, + "step": 8677 + }, + { + "epoch": 0.952998023281353, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3546533584594727, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7003166675567627, + "num_tokens": 224515134.0, + "step": 8678 + }, + { + "epoch": 0.9531078409839666, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1927196979522705, + "learning_rate": 1e-06, + "loss": 1.1147, + "mean_token_accuracy": 0.6749832034111023, + "num_tokens": 224548089.0, + "step": 8679 + }, + { + "epoch": 0.9532176586865803, + 
"ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.339831829071045, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7066571712493896, + "num_tokens": 224571705.0, + "step": 8680 + }, + { + "epoch": 0.9533274763891939, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3585610389709473, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.6862381100654602, + "num_tokens": 224596889.0, + "step": 8681 + }, + { + "epoch": 0.9534372940918076, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.9575860500335693, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7132428884506226, + "num_tokens": 224632269.0, + "step": 8682 + }, + { + "epoch": 0.9535471117944213, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.224485397338867, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6970731616020203, + "num_tokens": 224659968.0, + "step": 8683 + }, + { + "epoch": 0.953656929497035, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1546788215637207, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7137571573257446, + "num_tokens": 224689653.0, + "step": 8684 + }, + { + "epoch": 0.9537667471996486, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.341008186340332, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6917667388916016, + "num_tokens": 224717652.0, + "step": 8685 + }, + { + "epoch": 0.9538765649022622, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.077026605606079, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6886364221572876, + "num_tokens": 224751602.0, + "step": 8686 + }, + { + "epoch": 0.9539863826048759, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4441449642181396, + "learning_rate": 1e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.6927856206893921, + "num_tokens": 224775812.0, + "step": 8687 + }, + { + "epoch": 0.9540962003074895, + "ewc_loss": 1.4841556549072266e-05, 
+ "grad_norm": 2.404017925262451, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.72037672996521, + "num_tokens": 224798433.0, + "step": 8688 + }, + { + "epoch": 0.9542060180101032, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.252946615219116, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6937664747238159, + "num_tokens": 224826605.0, + "step": 8689 + }, + { + "epoch": 0.9543158357127169, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.339017868041992, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7223238945007324, + "num_tokens": 224850608.0, + "step": 8690 + }, + { + "epoch": 0.9544256534153306, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.6264805793762207, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.704067587852478, + "num_tokens": 224872229.0, + "step": 8691 + }, + { + "epoch": 0.9545354711179442, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2354307174682617, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7186794281005859, + "num_tokens": 224899062.0, + "step": 8692 + }, + { + "epoch": 0.9546452888205579, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.608124256134033, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7116957902908325, + "num_tokens": 224920953.0, + "step": 8693 + }, + { + "epoch": 0.9547551065231715, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5092296600341797, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6959090232849121, + "num_tokens": 224945827.0, + "step": 8694 + }, + { + "epoch": 0.9548649242257852, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2526955604553223, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7006794214248657, + "num_tokens": 224975422.0, + "step": 8695 + }, + { + "epoch": 0.9549747419283988, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.368434190750122, + 
"learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6949564814567566, + "num_tokens": 225001592.0, + "step": 8696 + }, + { + "epoch": 0.9550845596310125, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.7634682655334473, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7184324264526367, + "num_tokens": 225022802.0, + "step": 8697 + }, + { + "epoch": 0.9551943773336262, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.420064687728882, + "learning_rate": 1e-06, + "loss": 1.0862, + "mean_token_accuracy": 0.6847655177116394, + "num_tokens": 225048842.0, + "step": 8698 + }, + { + "epoch": 0.9553041950362399, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1095094680786133, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.7055941820144653, + "num_tokens": 225080619.0, + "step": 8699 + }, + { + "epoch": 0.9554140127388535, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.118741989135742, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7072227001190186, + "num_tokens": 225110529.0, + "step": 8700 + }, + { + "epoch": 0.9555238304414672, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3457512855529785, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7332785129547119, + "num_tokens": 225136463.0, + "step": 8701 + }, + { + "epoch": 0.9556336481440808, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.4601855278015137, + "learning_rate": 1e-06, + "loss": 1.0676, + "mean_token_accuracy": 0.6830337047576904, + "num_tokens": 225161023.0, + "step": 8702 + }, + { + "epoch": 0.9557434658466945, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.2738287448883057, + "learning_rate": 1e-06, + "loss": 1.1466, + "mean_token_accuracy": 0.6689051985740662, + "num_tokens": 225188822.0, + "step": 8703 + }, + { + "epoch": 0.9558532835493081, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.1531898975372314, + "learning_rate": 1e-06, + "loss": 
1.0059, + "mean_token_accuracy": 0.7030360698699951, + "num_tokens": 225219426.0, + "step": 8704 + }, + { + "epoch": 0.9559631012519219, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5369365215301514, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7079368829727173, + "num_tokens": 225241096.0, + "step": 8705 + }, + { + "epoch": 0.9560729189545355, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3572311401367188, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7012868523597717, + "num_tokens": 225264747.0, + "step": 8706 + }, + { + "epoch": 0.9561827366571491, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.340291738510132, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7043969035148621, + "num_tokens": 225287683.0, + "step": 8707 + }, + { + "epoch": 0.9562925543597628, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.193319797515869, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6875048875808716, + "num_tokens": 225316574.0, + "step": 8708 + }, + { + "epoch": 0.9564023720623764, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.3031692504882812, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7057359218597412, + "num_tokens": 225343257.0, + "step": 8709 + }, + { + "epoch": 0.9565121897649901, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 2.5457582473754883, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7360806465148926, + "num_tokens": 225366372.0, + "step": 8710 + }, + { + "epoch": 0.9566220074676037, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2586002349853516, + "learning_rate": 1e-06, + "loss": 1.1042, + "mean_token_accuracy": 0.6855779886245728, + "num_tokens": 225397796.0, + "step": 8711 + }, + { + "epoch": 0.9567318251702175, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.224990129470825, + "learning_rate": 1e-06, + "loss": 1.1607, + "mean_token_accuracy": 
0.6733821630477905, + "num_tokens": 225431624.0, + "step": 8712 + }, + { + "epoch": 0.9568416428728311, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.215944528579712, + "learning_rate": 1e-06, + "loss": 1.1239, + "mean_token_accuracy": 0.6780853867530823, + "num_tokens": 225459819.0, + "step": 8713 + }, + { + "epoch": 0.9569514605754448, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.385854482650757, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7042851448059082, + "num_tokens": 225487993.0, + "step": 8714 + }, + { + "epoch": 0.9570612782780584, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.306121826171875, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7146235704421997, + "num_tokens": 225513334.0, + "step": 8715 + }, + { + "epoch": 0.9571710959806721, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3943212032318115, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7082586884498596, + "num_tokens": 225537623.0, + "step": 8716 + }, + { + "epoch": 0.9572809136832857, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.6787612438201904, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7057483196258545, + "num_tokens": 225556885.0, + "step": 8717 + }, + { + "epoch": 0.9573907313858994, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.493309736251831, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6975630521774292, + "num_tokens": 225582847.0, + "step": 8718 + }, + { + "epoch": 0.9575005490885131, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.8787455558776855, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7324908375740051, + "num_tokens": 225598872.0, + "step": 8719 + }, + { + "epoch": 0.9576103667911268, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.4898123741149902, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6931443214416504, + "num_tokens": 
225624823.0, + "step": 8720 + }, + { + "epoch": 0.9577201844937404, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.4342260360717773, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6972487568855286, + "num_tokens": 225649755.0, + "step": 8721 + }, + { + "epoch": 0.9578300021963541, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.0299696922302246, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6852681040763855, + "num_tokens": 225683143.0, + "step": 8722 + }, + { + "epoch": 0.9579398198989677, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.0214297771453857, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.719063401222229, + "num_tokens": 225714652.0, + "step": 8723 + }, + { + "epoch": 0.9580496376015813, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.525158643722534, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7174120545387268, + "num_tokens": 225737649.0, + "step": 8724 + }, + { + "epoch": 0.958159455304195, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.306962490081787, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7144247889518738, + "num_tokens": 225762807.0, + "step": 8725 + }, + { + "epoch": 0.9582692730068088, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2649741172790527, + "learning_rate": 1e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6831717491149902, + "num_tokens": 225790806.0, + "step": 8726 + }, + { + "epoch": 0.9583790907094224, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2556936740875244, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7095974087715149, + "num_tokens": 225818475.0, + "step": 8727 + }, + { + "epoch": 0.958488908412036, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.623370885848999, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.7081304788589478, + "num_tokens": 225838758.0, + "step": 8728 + }, + { + 
"epoch": 0.9585987261146497, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.156362771987915, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7065603733062744, + "num_tokens": 225869813.0, + "step": 8729 + }, + { + "epoch": 0.9587085438172633, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.400744915008545, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.714045524597168, + "num_tokens": 225893794.0, + "step": 8730 + }, + { + "epoch": 0.958818361519877, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2920844554901123, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6947896480560303, + "num_tokens": 225920531.0, + "step": 8731 + }, + { + "epoch": 0.9589281792224906, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.1282122135162354, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7083106637001038, + "num_tokens": 225951936.0, + "step": 8732 + }, + { + "epoch": 0.9590379969251043, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.584882974624634, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7115368843078613, + "num_tokens": 225974222.0, + "step": 8733 + }, + { + "epoch": 0.959147814627718, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3848390579223633, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7207213640213013, + "num_tokens": 225997213.0, + "step": 8734 + }, + { + "epoch": 0.9592576323303317, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.574817180633545, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.700393557548523, + "num_tokens": 226017402.0, + "step": 8735 + }, + { + "epoch": 0.9593674500329453, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.316087245941162, + "learning_rate": 1e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.6827431917190552, + "num_tokens": 226046124.0, + "step": 8736 + }, + { + "epoch": 0.959477267735559, + 
"ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3370399475097656, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6964917778968811, + "num_tokens": 226072832.0, + "step": 8737 + }, + { + "epoch": 0.9595870854381726, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.1740336418151855, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.710224986076355, + "num_tokens": 226099350.0, + "step": 8738 + }, + { + "epoch": 0.9596969031407863, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3822553157806396, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7056452035903931, + "num_tokens": 226123899.0, + "step": 8739 + }, + { + "epoch": 0.9598067208433999, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2487587928771973, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7076900005340576, + "num_tokens": 226151450.0, + "step": 8740 + }, + { + "epoch": 0.9599165385460137, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.225102424621582, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7148394584655762, + "num_tokens": 226180002.0, + "step": 8741 + }, + { + "epoch": 0.9600263562486273, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3049752712249756, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6935849189758301, + "num_tokens": 226207642.0, + "step": 8742 + }, + { + "epoch": 0.960136173951241, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.506680965423584, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7036541700363159, + "num_tokens": 226233048.0, + "step": 8743 + }, + { + "epoch": 0.9602459916538546, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.483344078063965, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7224978804588318, + "num_tokens": 226254190.0, + "step": 8744 + }, + { + "epoch": 0.9603558093564682, + "ewc_loss": 1.4901161193847656e-05, + 
"grad_norm": 2.2178125381469727, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6997346878051758, + "num_tokens": 226280390.0, + "step": 8745 + }, + { + "epoch": 0.9604656270590819, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.1629278659820557, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.701543927192688, + "num_tokens": 226308489.0, + "step": 8746 + }, + { + "epoch": 0.9605754447616955, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.282668352127075, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7119795083999634, + "num_tokens": 226334100.0, + "step": 8747 + }, + { + "epoch": 0.9606852624643093, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.587512254714966, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7182016372680664, + "num_tokens": 226355150.0, + "step": 8748 + }, + { + "epoch": 0.9607950801669229, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2393252849578857, + "learning_rate": 1e-06, + "loss": 1.1047, + "mean_token_accuracy": 0.6800525188446045, + "num_tokens": 226385231.0, + "step": 8749 + }, + { + "epoch": 0.9609048978695366, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1191723346710205, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7070028185844421, + "num_tokens": 226416209.0, + "step": 8750 + }, + { + "epoch": 0.9610147155721502, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.712559223175049, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7098239064216614, + "num_tokens": 226435635.0, + "step": 8751 + }, + { + "epoch": 0.9611245332747639, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.063209056854248, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6995155811309814, + "num_tokens": 226468132.0, + "step": 8752 + }, + { + "epoch": 0.9612343509773775, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.351541519165039, + 
"learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7226577401161194, + "num_tokens": 226492763.0, + "step": 8753 + }, + { + "epoch": 0.9613441686799912, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3671011924743652, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7060853242874146, + "num_tokens": 226518240.0, + "step": 8754 + }, + { + "epoch": 0.9614539863826049, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.6323230266571045, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7111164331436157, + "num_tokens": 226538071.0, + "step": 8755 + }, + { + "epoch": 0.9615638040852186, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.0457119941711426, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7042968273162842, + "num_tokens": 226567951.0, + "step": 8756 + }, + { + "epoch": 0.9616736217878322, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.8200621604919434, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7088741064071655, + "num_tokens": 226585283.0, + "step": 8757 + }, + { + "epoch": 0.9617834394904459, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1607446670532227, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7006793022155762, + "num_tokens": 226614720.0, + "step": 8758 + }, + { + "epoch": 0.9618932571930595, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.624929904937744, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.7056794166564941, + "num_tokens": 226636638.0, + "step": 8759 + }, + { + "epoch": 0.9620030748956732, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.264362096786499, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6923665404319763, + "num_tokens": 226664693.0, + "step": 8760 + }, + { + "epoch": 0.9621128925982868, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.352656126022339, + "learning_rate": 1e-06, + "loss": 
1.0987, + "mean_token_accuracy": 0.6750420928001404, + "num_tokens": 226691569.0, + "step": 8761 + }, + { + "epoch": 0.9622227103009005, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.775773048400879, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7236005663871765, + "num_tokens": 226708968.0, + "step": 8762 + }, + { + "epoch": 0.9623325280035142, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.1401138305664062, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6900831460952759, + "num_tokens": 226739371.0, + "step": 8763 + }, + { + "epoch": 0.9624423457061279, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3815722465515137, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6942126154899597, + "num_tokens": 226765112.0, + "step": 8764 + }, + { + "epoch": 0.9625521634087415, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.3255672454833984, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.701829195022583, + "num_tokens": 226790665.0, + "step": 8765 + }, + { + "epoch": 0.9626619811113551, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1866753101348877, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7036714553833008, + "num_tokens": 226820430.0, + "step": 8766 + }, + { + "epoch": 0.9627717988139688, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.139601469039917, + "learning_rate": 1e-06, + "loss": 1.0914, + "mean_token_accuracy": 0.6901423335075378, + "num_tokens": 226852716.0, + "step": 8767 + }, + { + "epoch": 0.9628816165165824, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2270822525024414, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7127153873443604, + "num_tokens": 226880918.0, + "step": 8768 + }, + { + "epoch": 0.9629914342191961, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.4382951259613037, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 
0.7004915475845337, + "num_tokens": 226904758.0, + "step": 8769 + }, + { + "epoch": 0.9631012519218098, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2290265560150146, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6898777484893799, + "num_tokens": 226933464.0, + "step": 8770 + }, + { + "epoch": 0.9632110696244235, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.199225902557373, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7132613658905029, + "num_tokens": 226961442.0, + "step": 8771 + }, + { + "epoch": 0.9633208873270371, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.376551866531372, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6972900629043579, + "num_tokens": 226987718.0, + "step": 8772 + }, + { + "epoch": 0.9634307050296508, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3113205432891846, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6971464157104492, + "num_tokens": 227014707.0, + "step": 8773 + }, + { + "epoch": 0.9635405227322644, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2150461673736572, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6933717727661133, + "num_tokens": 227042658.0, + "step": 8774 + }, + { + "epoch": 0.9636503404348781, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.428339719772339, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7144776582717896, + "num_tokens": 227065980.0, + "step": 8775 + }, + { + "epoch": 0.9637601581374917, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.308659315109253, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6833587288856506, + "num_tokens": 227092798.0, + "step": 8776 + }, + { + "epoch": 0.9638699758401055, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2971174716949463, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6951984167098999, + "num_tokens": 
227119037.0, + "step": 8777 + }, + { + "epoch": 0.9639797935427191, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1692397594451904, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7094240784645081, + "num_tokens": 227147385.0, + "step": 8778 + }, + { + "epoch": 0.9640896112453328, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2419893741607666, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6974320411682129, + "num_tokens": 227175846.0, + "step": 8779 + }, + { + "epoch": 0.9641994289479464, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2630341053009033, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6910462975502014, + "num_tokens": 227203360.0, + "step": 8780 + }, + { + "epoch": 0.9643092466505601, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.370192766189575, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6977677345275879, + "num_tokens": 227228358.0, + "step": 8781 + }, + { + "epoch": 0.9644190643531737, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.195512533187866, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7207722067832947, + "num_tokens": 227255492.0, + "step": 8782 + }, + { + "epoch": 0.9645288820557874, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5504558086395264, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7108640074729919, + "num_tokens": 227276939.0, + "step": 8783 + }, + { + "epoch": 0.9646386997584011, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1852118968963623, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6949553489685059, + "num_tokens": 227305598.0, + "step": 8784 + }, + { + "epoch": 0.9647485174610148, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.47578501701355, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6986352205276489, + "num_tokens": 227329581.0, + "step": 8785 + }, + 
{ + "epoch": 0.9648583351636284, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2932779788970947, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7060432434082031, + "num_tokens": 227354844.0, + "step": 8786 + }, + { + "epoch": 0.964968152866242, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1827540397644043, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6978174448013306, + "num_tokens": 227383559.0, + "step": 8787 + }, + { + "epoch": 0.9650779705688557, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.379542827606201, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7136927843093872, + "num_tokens": 227407240.0, + "step": 8788 + }, + { + "epoch": 0.9651877882714693, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1137144565582275, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.7044720649719238, + "num_tokens": 227438972.0, + "step": 8789 + }, + { + "epoch": 0.965297605974083, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2327518463134766, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.701903223991394, + "num_tokens": 227466755.0, + "step": 8790 + }, + { + "epoch": 0.9654074236766966, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.745802402496338, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7320587635040283, + "num_tokens": 227486042.0, + "step": 8791 + }, + { + "epoch": 0.9655172413793104, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.248915672302246, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7282963991165161, + "num_tokens": 227513569.0, + "step": 8792 + }, + { + "epoch": 0.965627059081924, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6157212257385254, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6974734663963318, + "num_tokens": 227536632.0, + "step": 8793 + }, + { + "epoch": 0.9657368767845377, + 
"ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.269498348236084, + "learning_rate": 1e-06, + "loss": 1.0789, + "mean_token_accuracy": 0.6864580512046814, + "num_tokens": 227565842.0, + "step": 8794 + }, + { + "epoch": 0.9658466944871513, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1833269596099854, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.7010089159011841, + "num_tokens": 227595085.0, + "step": 8795 + }, + { + "epoch": 0.965956512189765, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.350202798843384, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7099318504333496, + "num_tokens": 227619531.0, + "step": 8796 + }, + { + "epoch": 0.9660663298923786, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.42879319190979, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6915259957313538, + "num_tokens": 227643118.0, + "step": 8797 + }, + { + "epoch": 0.9661761475949923, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.289577007293701, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7102494239807129, + "num_tokens": 227668000.0, + "step": 8798 + }, + { + "epoch": 0.966285965297606, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4405746459960938, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7255691289901733, + "num_tokens": 227690524.0, + "step": 8799 + }, + { + "epoch": 0.9663957830002197, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3724472522735596, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7115658521652222, + "num_tokens": 227714620.0, + "step": 8800 + }, + { + "epoch": 0.9665056007028333, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.288435935974121, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7208337187767029, + "num_tokens": 227739464.0, + "step": 8801 + }, + { + "epoch": 0.966615418405447, + "ewc_loss": 1.4960765838623047e-05, + 
"grad_norm": 2.3770816326141357, + "learning_rate": 1e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6979825496673584, + "num_tokens": 227764732.0, + "step": 8802 + }, + { + "epoch": 0.9667252361080606, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.232205867767334, + "learning_rate": 1e-06, + "loss": 1.0747, + "mean_token_accuracy": 0.6802247166633606, + "num_tokens": 227795769.0, + "step": 8803 + }, + { + "epoch": 0.9668350538106742, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.417102336883545, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.712443470954895, + "num_tokens": 227820704.0, + "step": 8804 + }, + { + "epoch": 0.9669448715132879, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.0252113342285156, + "learning_rate": 1e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.6879039406776428, + "num_tokens": 227857518.0, + "step": 8805 + }, + { + "epoch": 0.9670546892159017, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.310925006866455, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7167952060699463, + "num_tokens": 227881946.0, + "step": 8806 + }, + { + "epoch": 0.9671645069185153, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2048473358154297, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6845685243606567, + "num_tokens": 227912468.0, + "step": 8807 + }, + { + "epoch": 0.967274324621129, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.448225736618042, + "learning_rate": 1e-06, + "loss": 1.1095, + "mean_token_accuracy": 0.6771060824394226, + "num_tokens": 227938208.0, + "step": 8808 + }, + { + "epoch": 0.9673841423237426, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4873788356781006, + "learning_rate": 1e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.6925642490386963, + "num_tokens": 227964581.0, + "step": 8809 + }, + { + "epoch": 0.9674939600263562, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2284204959869385, + 
"learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.7024463415145874, + "num_tokens": 227995051.0, + "step": 8810 + }, + { + "epoch": 0.9676037777289699, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.964094400405884, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6994833946228027, + "num_tokens": 228011822.0, + "step": 8811 + }, + { + "epoch": 0.9677135954315835, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.2405388355255127, + "learning_rate": 1e-06, + "loss": 1.1216, + "mean_token_accuracy": 0.6691172122955322, + "num_tokens": 228041097.0, + "step": 8812 + }, + { + "epoch": 0.9678234131341973, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.077950954437256, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7028731107711792, + "num_tokens": 228073745.0, + "step": 8813 + }, + { + "epoch": 0.9679332308368109, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.431004762649536, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7109980583190918, + "num_tokens": 228096488.0, + "step": 8814 + }, + { + "epoch": 0.9680430485394246, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1994545459747314, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6927837133407593, + "num_tokens": 228126843.0, + "step": 8815 + }, + { + "epoch": 0.9681528662420382, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.5107309818267822, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7208527326583862, + "num_tokens": 228148818.0, + "step": 8816 + }, + { + "epoch": 0.9682626839446519, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4597790241241455, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7264105081558228, + "num_tokens": 228171522.0, + "step": 8817 + }, + { + "epoch": 0.9683725016472655, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.4642951488494873, + "learning_rate": 1e-06, + "loss": 
1.0792, + "mean_token_accuracy": 0.6861920356750488, + "num_tokens": 228197254.0, + "step": 8818 + }, + { + "epoch": 0.9684823193498792, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 2.546766757965088, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7006166577339172, + "num_tokens": 228219818.0, + "step": 8819 + }, + { + "epoch": 0.9685921370524928, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5863535404205322, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7146002054214478, + "num_tokens": 228239719.0, + "step": 8820 + }, + { + "epoch": 0.9687019547551066, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4423868656158447, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7229478359222412, + "num_tokens": 228262713.0, + "step": 8821 + }, + { + "epoch": 0.9688117724577202, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2835826873779297, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.7048529386520386, + "num_tokens": 228289415.0, + "step": 8822 + }, + { + "epoch": 0.9689215901603339, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.225482940673828, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7033678293228149, + "num_tokens": 228315429.0, + "step": 8823 + }, + { + "epoch": 0.9690314078629475, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3448681831359863, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7389339804649353, + "num_tokens": 228339247.0, + "step": 8824 + }, + { + "epoch": 0.9691412255655611, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3011839389801025, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7201969027519226, + "num_tokens": 228364715.0, + "step": 8825 + }, + { + "epoch": 0.9692510432681748, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.644468307495117, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 
0.7240470051765442, + "num_tokens": 228386350.0, + "step": 8826 + }, + { + "epoch": 0.9693608609707884, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1290037631988525, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.7042410373687744, + "num_tokens": 228418504.0, + "step": 8827 + }, + { + "epoch": 0.9694706786734022, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4205493927001953, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.712730884552002, + "num_tokens": 228443214.0, + "step": 8828 + }, + { + "epoch": 0.9695804963760158, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5737481117248535, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7131174206733704, + "num_tokens": 228465758.0, + "step": 8829 + }, + { + "epoch": 0.9696903140786295, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.213435173034668, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.71335768699646, + "num_tokens": 228492918.0, + "step": 8830 + }, + { + "epoch": 0.9698001317812431, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4090826511383057, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6916854381561279, + "num_tokens": 228517881.0, + "step": 8831 + }, + { + "epoch": 0.9699099494838568, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.205191135406494, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7108762264251709, + "num_tokens": 228548073.0, + "step": 8832 + }, + { + "epoch": 0.9700197671864704, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6332015991210938, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.714308500289917, + "num_tokens": 228568379.0, + "step": 8833 + }, + { + "epoch": 0.9701295848890841, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3303425312042236, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7034663558006287, + "num_tokens": 
228595878.0, + "step": 8834 + }, + { + "epoch": 0.9702394025916978, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2369954586029053, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6969690918922424, + "num_tokens": 228626734.0, + "step": 8835 + }, + { + "epoch": 0.9703492202943115, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2236764430999756, + "learning_rate": 1e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.6869015097618103, + "num_tokens": 228657659.0, + "step": 8836 + }, + { + "epoch": 0.9704590379969251, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.443371534347534, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7074058055877686, + "num_tokens": 228679738.0, + "step": 8837 + }, + { + "epoch": 0.9705688556995388, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.303215742111206, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7075204849243164, + "num_tokens": 228706159.0, + "step": 8838 + }, + { + "epoch": 0.9706786734021524, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.7465004920959473, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7075415849685669, + "num_tokens": 228727542.0, + "step": 8839 + }, + { + "epoch": 0.9707884911047661, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4297711849212646, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6922143697738647, + "num_tokens": 228751295.0, + "step": 8840 + }, + { + "epoch": 0.9708983088073797, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.358644962310791, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6978211402893066, + "num_tokens": 228778417.0, + "step": 8841 + }, + { + "epoch": 0.9710081265099935, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.035616159439087, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6940677165985107, + "num_tokens": 228812038.0, + "step": 8842 + }, + { 
+ "epoch": 0.9711179442126071, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.185587167739868, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7049716114997864, + "num_tokens": 228839443.0, + "step": 8843 + }, + { + "epoch": 0.9712277619152208, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5265891551971436, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7164360284805298, + "num_tokens": 228863670.0, + "step": 8844 + }, + { + "epoch": 0.9713375796178344, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 1.9535856246948242, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.7025992274284363, + "num_tokens": 228899048.0, + "step": 8845 + }, + { + "epoch": 0.971447397320448, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.469013214111328, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6805967092514038, + "num_tokens": 228925711.0, + "step": 8846 + }, + { + "epoch": 0.9715572150230617, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5286548137664795, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.709484338760376, + "num_tokens": 228948934.0, + "step": 8847 + }, + { + "epoch": 0.9716670327256753, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2044920921325684, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7094123959541321, + "num_tokens": 228975185.0, + "step": 8848 + }, + { + "epoch": 0.971776850428289, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3025922775268555, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.72564697265625, + "num_tokens": 229001724.0, + "step": 8849 + }, + { + "epoch": 0.9718866681309027, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6435751914978027, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7232120633125305, + "num_tokens": 229023802.0, + "step": 8850 + }, + { + "epoch": 0.9719964858335164, + 
"ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2470884323120117, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7196122407913208, + "num_tokens": 229053214.0, + "step": 8851 + }, + { + "epoch": 0.97210630353613, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2057032585144043, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7087434530258179, + "num_tokens": 229082556.0, + "step": 8852 + }, + { + "epoch": 0.9722161212387437, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.648169755935669, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6982091665267944, + "num_tokens": 229102678.0, + "step": 8853 + }, + { + "epoch": 0.9723259389413573, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6045923233032227, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7053160071372986, + "num_tokens": 229127059.0, + "step": 8854 + }, + { + "epoch": 0.972435756643971, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.445960521697998, + "learning_rate": 1e-06, + "loss": 1.0658, + "mean_token_accuracy": 0.6975592374801636, + "num_tokens": 229151902.0, + "step": 8855 + }, + { + "epoch": 0.9725455743465846, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4952943325042725, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7161715626716614, + "num_tokens": 229174950.0, + "step": 8856 + }, + { + "epoch": 0.9726553920491984, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5469350814819336, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7144378423690796, + "num_tokens": 229196120.0, + "step": 8857 + }, + { + "epoch": 0.972765209751812, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2801594734191895, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6977373957633972, + "num_tokens": 229223840.0, + "step": 8858 + }, + { + "epoch": 0.9728750274544257, + "ewc_loss": 1.4960765838623047e-05, + 
"grad_norm": 2.692526340484619, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7113534212112427, + "num_tokens": 229243153.0, + "step": 8859 + }, + { + "epoch": 0.9729848451570393, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3964223861694336, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6977354288101196, + "num_tokens": 229270777.0, + "step": 8860 + }, + { + "epoch": 0.973094662859653, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5656580924987793, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.7093738317489624, + "num_tokens": 229292593.0, + "step": 8861 + }, + { + "epoch": 0.9732044805622666, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.419593334197998, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7086400389671326, + "num_tokens": 229317275.0, + "step": 8862 + }, + { + "epoch": 0.9733142982648803, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.398550271987915, + "learning_rate": 1e-06, + "loss": 1.0798, + "mean_token_accuracy": 0.6900361776351929, + "num_tokens": 229343841.0, + "step": 8863 + }, + { + "epoch": 0.973424115967494, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.497748613357544, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6920077204704285, + "num_tokens": 229368220.0, + "step": 8864 + }, + { + "epoch": 0.9735339336701077, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6399333477020264, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6891659498214722, + "num_tokens": 229389388.0, + "step": 8865 + }, + { + "epoch": 0.9736437513727213, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1865181922912598, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6946633458137512, + "num_tokens": 229419230.0, + "step": 8866 + }, + { + "epoch": 0.973753569075335, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.180137872695923, + 
"learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6879233121871948, + "num_tokens": 229448049.0, + "step": 8867 + }, + { + "epoch": 0.9738633867779486, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5267200469970703, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6922990083694458, + "num_tokens": 229472943.0, + "step": 8868 + }, + { + "epoch": 0.9739732044805622, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6246631145477295, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7198156118392944, + "num_tokens": 229492958.0, + "step": 8869 + }, + { + "epoch": 0.9740830221831759, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2372069358825684, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7512421011924744, + "num_tokens": 229518904.0, + "step": 8870 + }, + { + "epoch": 0.9741928398857896, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2651114463806152, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7032274007797241, + "num_tokens": 229546570.0, + "step": 8871 + }, + { + "epoch": 0.9743026575884033, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5588314533233643, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7141816020011902, + "num_tokens": 229567965.0, + "step": 8872 + }, + { + "epoch": 0.9744124752910169, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2321231365203857, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.7065684199333191, + "num_tokens": 229596198.0, + "step": 8873 + }, + { + "epoch": 0.9745222929936306, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1775262355804443, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7023557424545288, + "num_tokens": 229626224.0, + "step": 8874 + }, + { + "epoch": 0.9746321106962442, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.333597183227539, + "learning_rate": 1e-06, + "loss": 
1.0523, + "mean_token_accuracy": 0.6881685256958008, + "num_tokens": 229652706.0, + "step": 8875 + }, + { + "epoch": 0.9747419283988579, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.227849006652832, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7018084526062012, + "num_tokens": 229679203.0, + "step": 8876 + }, + { + "epoch": 0.9748517461014715, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.540245294570923, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.697827160358429, + "num_tokens": 229702096.0, + "step": 8877 + }, + { + "epoch": 0.9749615638040853, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.72428035736084, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6961473822593689, + "num_tokens": 229721871.0, + "step": 8878 + }, + { + "epoch": 0.9750713815066989, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.523158073425293, + "learning_rate": 1e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.6834220886230469, + "num_tokens": 229745519.0, + "step": 8879 + }, + { + "epoch": 0.9751811992093126, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2656071186065674, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.714620053768158, + "num_tokens": 229771220.0, + "step": 8880 + }, + { + "epoch": 0.9752910169119262, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4352941513061523, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6891322731971741, + "num_tokens": 229795634.0, + "step": 8881 + }, + { + "epoch": 0.9754008346145399, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6424832344055176, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7340005040168762, + "num_tokens": 229815447.0, + "step": 8882 + }, + { + "epoch": 0.9755106523171535, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1114470958709717, + "learning_rate": 1e-06, + "loss": 1.1639, + "mean_token_accuracy": 
0.6605726480484009, + "num_tokens": 229846624.0, + "step": 8883 + }, + { + "epoch": 0.9756204700197671, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5043399333953857, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7068055272102356, + "num_tokens": 229870063.0, + "step": 8884 + }, + { + "epoch": 0.9757302877223808, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.256906747817993, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7037217617034912, + "num_tokens": 229898350.0, + "step": 8885 + }, + { + "epoch": 0.9758401054249946, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6159212589263916, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6892222166061401, + "num_tokens": 229921538.0, + "step": 8886 + }, + { + "epoch": 0.9759499231276082, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4580578804016113, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6933212876319885, + "num_tokens": 229945119.0, + "step": 8887 + }, + { + "epoch": 0.9760597408302218, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5650436878204346, + "learning_rate": 1e-06, + "loss": 1.075, + "mean_token_accuracy": 0.6855868697166443, + "num_tokens": 229968920.0, + "step": 8888 + }, + { + "epoch": 0.9761695585328355, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1090848445892334, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7069724798202515, + "num_tokens": 230003802.0, + "step": 8889 + }, + { + "epoch": 0.9762793762354491, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3205580711364746, + "learning_rate": 1e-06, + "loss": 1.0908, + "mean_token_accuracy": 0.6944432258605957, + "num_tokens": 230030227.0, + "step": 8890 + }, + { + "epoch": 0.9763891939380628, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6166436672210693, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7114585638046265, + 
"num_tokens": 230053413.0, + "step": 8891 + }, + { + "epoch": 0.9764990116406764, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.600013256072998, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6992074251174927, + "num_tokens": 230076540.0, + "step": 8892 + }, + { + "epoch": 0.9766088293432902, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.352605104446411, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7072551846504211, + "num_tokens": 230102079.0, + "step": 8893 + }, + { + "epoch": 0.9767186470459038, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.313732624053955, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6930923461914062, + "num_tokens": 230129033.0, + "step": 8894 + }, + { + "epoch": 0.9768284647485175, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.228680372238159, + "learning_rate": 1e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6868261098861694, + "num_tokens": 230158669.0, + "step": 8895 + }, + { + "epoch": 0.9769382824511311, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.621927261352539, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7203000783920288, + "num_tokens": 230178979.0, + "step": 8896 + }, + { + "epoch": 0.9770481001537448, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3453869819641113, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.7096431851387024, + "num_tokens": 230203677.0, + "step": 8897 + }, + { + "epoch": 0.9771579178563584, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.601912260055542, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7359310388565063, + "num_tokens": 230223685.0, + "step": 8898 + }, + { + "epoch": 0.9772677355589721, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.5202014446258545, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7242218255996704, + "num_tokens": 230243836.0, + "step": 
8899 + }, + { + "epoch": 0.9773775532615858, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4442830085754395, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.689386248588562, + "num_tokens": 230267068.0, + "step": 8900 + }, + { + "epoch": 0.9774873709641995, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3677871227264404, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6936895251274109, + "num_tokens": 230293163.0, + "step": 8901 + }, + { + "epoch": 0.9775971886668131, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1215460300445557, + "learning_rate": 1e-06, + "loss": 1.0962, + "mean_token_accuracy": 0.6811066269874573, + "num_tokens": 230325348.0, + "step": 8902 + }, + { + "epoch": 0.9777070063694268, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.7369062900543213, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7302529215812683, + "num_tokens": 230345616.0, + "step": 8903 + }, + { + "epoch": 0.9778168240720404, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.566823720932007, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.713729977607727, + "num_tokens": 230366751.0, + "step": 8904 + }, + { + "epoch": 0.977926641774654, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2705609798431396, + "learning_rate": 1e-06, + "loss": 1.0952, + "mean_token_accuracy": 0.6821273565292358, + "num_tokens": 230395863.0, + "step": 8905 + }, + { + "epoch": 0.9780364594772677, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.89294171333313, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7214264869689941, + "num_tokens": 230414203.0, + "step": 8906 + }, + { + "epoch": 0.9781462771798815, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4267117977142334, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7104542255401611, + "num_tokens": 230440005.0, + "step": 8907 + }, + { + "epoch": 
0.9782560948824951, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.6065151691436768, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6979504823684692, + "num_tokens": 230464148.0, + "step": 8908 + }, + { + "epoch": 0.9783659125851087, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6220240592956543, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7094119787216187, + "num_tokens": 230484594.0, + "step": 8909 + }, + { + "epoch": 0.9784757302877224, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2509584426879883, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7065573334693909, + "num_tokens": 230510260.0, + "step": 8910 + }, + { + "epoch": 0.978585547990336, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1595940589904785, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6926404237747192, + "num_tokens": 230539461.0, + "step": 8911 + }, + { + "epoch": 0.9786953656929497, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.282381772994995, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7037099003791809, + "num_tokens": 230566495.0, + "step": 8912 + }, + { + "epoch": 0.9788051833955633, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 1.9732319116592407, + "learning_rate": 1e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6892118453979492, + "num_tokens": 230601093.0, + "step": 8913 + }, + { + "epoch": 0.978915001098177, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.112720251083374, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7044239044189453, + "num_tokens": 230635869.0, + "step": 8914 + }, + { + "epoch": 0.9790248188007907, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1871659755706787, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6902537941932678, + "num_tokens": 230663962.0, + "step": 8915 + }, + { + "epoch": 0.9791346365034044, + "ewc_loss": 
1.4960765838623047e-05, + "grad_norm": 2.1350204944610596, + "learning_rate": 1e-06, + "loss": 1.0808, + "mean_token_accuracy": 0.6860070824623108, + "num_tokens": 230696912.0, + "step": 8916 + }, + { + "epoch": 0.979244454206018, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.208622455596924, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6993002891540527, + "num_tokens": 230725370.0, + "step": 8917 + }, + { + "epoch": 0.9793542719086317, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.434600591659546, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7036037445068359, + "num_tokens": 230749997.0, + "step": 8918 + }, + { + "epoch": 0.9794640896112453, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3290162086486816, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7349043488502502, + "num_tokens": 230774837.0, + "step": 8919 + }, + { + "epoch": 0.979573907313859, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4815430641174316, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.719601035118103, + "num_tokens": 230799299.0, + "step": 8920 + }, + { + "epoch": 0.9796837250164726, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 7.022468566894531, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6988258361816406, + "num_tokens": 230822987.0, + "step": 8921 + }, + { + "epoch": 0.9797935427190864, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6969168186187744, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7029677033424377, + "num_tokens": 230843790.0, + "step": 8922 + }, + { + "epoch": 0.9799033604217, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4252407550811768, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7170437574386597, + "num_tokens": 230866674.0, + "step": 8923 + }, + { + "epoch": 0.9800131781243137, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 
2.233703374862671, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.693821370601654, + "num_tokens": 230896171.0, + "step": 8924 + }, + { + "epoch": 0.9801229958269273, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1224091053009033, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6889110803604126, + "num_tokens": 230927449.0, + "step": 8925 + }, + { + "epoch": 0.980232813529541, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5492734909057617, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7119027972221375, + "num_tokens": 230949584.0, + "step": 8926 + }, + { + "epoch": 0.9803426312321546, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5620195865631104, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7192301154136658, + "num_tokens": 230971160.0, + "step": 8927 + }, + { + "epoch": 0.9804524489347682, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.245512008666992, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.708845853805542, + "num_tokens": 230998463.0, + "step": 8928 + }, + { + "epoch": 0.980562266637382, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.187002182006836, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7271139025688171, + "num_tokens": 231027874.0, + "step": 8929 + }, + { + "epoch": 0.9806720843399956, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3137378692626953, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7001407742500305, + "num_tokens": 231054479.0, + "step": 8930 + }, + { + "epoch": 0.9807819020426093, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1619791984558105, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6975917816162109, + "num_tokens": 231084304.0, + "step": 8931 + }, + { + "epoch": 0.9808917197452229, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.324084997177124, + "learning_rate": 
1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7081891894340515, + "num_tokens": 231109727.0, + "step": 8932 + }, + { + "epoch": 0.9810015374478366, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3407042026519775, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6943357586860657, + "num_tokens": 231138001.0, + "step": 8933 + }, + { + "epoch": 0.9811113551504502, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.551474094390869, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.7029417753219604, + "num_tokens": 231161019.0, + "step": 8934 + }, + { + "epoch": 0.9812211728530639, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5175282955169678, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7170332670211792, + "num_tokens": 231182904.0, + "step": 8935 + }, + { + "epoch": 0.9813309905556776, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.483135223388672, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7101517915725708, + "num_tokens": 231205834.0, + "step": 8936 + }, + { + "epoch": 0.9814408082582913, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 7.107816219329834, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7123149633407593, + "num_tokens": 231229297.0, + "step": 8937 + }, + { + "epoch": 0.9815506259609049, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3520865440368652, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7130200266838074, + "num_tokens": 231253732.0, + "step": 8938 + }, + { + "epoch": 0.9816604436635186, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4854354858398438, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6962511539459229, + "num_tokens": 231281803.0, + "step": 8939 + }, + { + "epoch": 0.9817702613661322, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.15265154838562, + "learning_rate": 1e-06, + "loss": 1.0712, + 
"mean_token_accuracy": 0.6849091053009033, + "num_tokens": 231312159.0, + "step": 8940 + }, + { + "epoch": 0.9818800790687459, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.250734567642212, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7150631546974182, + "num_tokens": 231339780.0, + "step": 8941 + }, + { + "epoch": 0.9819898967713595, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.397254228591919, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7231272459030151, + "num_tokens": 231365522.0, + "step": 8942 + }, + { + "epoch": 0.9820997144739732, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.309523344039917, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6919316053390503, + "num_tokens": 231392658.0, + "step": 8943 + }, + { + "epoch": 0.9822095321765869, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1340975761413574, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6957626938819885, + "num_tokens": 231421138.0, + "step": 8944 + }, + { + "epoch": 0.9823193498792006, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2050538063049316, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6977643966674805, + "num_tokens": 231449858.0, + "step": 8945 + }, + { + "epoch": 0.9824291675818142, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.507128953933716, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6961243748664856, + "num_tokens": 231471215.0, + "step": 8946 + }, + { + "epoch": 0.9825389852844278, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4346675872802734, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7475491762161255, + "num_tokens": 231493172.0, + "step": 8947 + }, + { + "epoch": 0.9826488029870415, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.436974287033081, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 
0.7094654440879822, + "num_tokens": 231517220.0, + "step": 8948 + }, + { + "epoch": 0.9827586206896551, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2040679454803467, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6901858448982239, + "num_tokens": 231546201.0, + "step": 8949 + }, + { + "epoch": 0.9828684383922688, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.46817684173584, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7148909568786621, + "num_tokens": 231569750.0, + "step": 8950 + }, + { + "epoch": 0.9829782560948825, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6276326179504395, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.7009822726249695, + "num_tokens": 231590161.0, + "step": 8951 + }, + { + "epoch": 0.9830880737974962, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1646697521209717, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6996942162513733, + "num_tokens": 231621120.0, + "step": 8952 + }, + { + "epoch": 0.9831978915001098, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.193128824234009, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.701897144317627, + "num_tokens": 231648905.0, + "step": 8953 + }, + { + "epoch": 0.9833077092027235, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4797024726867676, + "learning_rate": 1e-06, + "loss": 1.0791, + "mean_token_accuracy": 0.6856338977813721, + "num_tokens": 231676121.0, + "step": 8954 + }, + { + "epoch": 0.9834175269053371, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2864694595336914, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7085503935813904, + "num_tokens": 231703666.0, + "step": 8955 + }, + { + "epoch": 0.9835273446079508, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4669251441955566, + "learning_rate": 1e-06, + "loss": 1.0913, + "mean_token_accuracy": 0.6755489706993103, + "num_tokens": 
231729719.0, + "step": 8956 + }, + { + "epoch": 0.9836371623105644, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4104740619659424, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6922916173934937, + "num_tokens": 231756062.0, + "step": 8957 + }, + { + "epoch": 0.9837469800131782, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1285901069641113, + "learning_rate": 1e-06, + "loss": 1.1397, + "mean_token_accuracy": 0.6650464534759521, + "num_tokens": 231787721.0, + "step": 8958 + }, + { + "epoch": 0.9838567977157918, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.338773250579834, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6945905685424805, + "num_tokens": 231813870.0, + "step": 8959 + }, + { + "epoch": 0.9839666154184055, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2778608798980713, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7063686847686768, + "num_tokens": 231840917.0, + "step": 8960 + }, + { + "epoch": 0.9840764331210191, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.466299533843994, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7147814631462097, + "num_tokens": 231862843.0, + "step": 8961 + }, + { + "epoch": 0.9841862508236328, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.253481388092041, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7116894721984863, + "num_tokens": 231889043.0, + "step": 8962 + }, + { + "epoch": 0.9842960685262464, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.260991096496582, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7011802196502686, + "num_tokens": 231916725.0, + "step": 8963 + }, + { + "epoch": 0.98440588622886, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 7.066314697265625, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7190071940422058, + "num_tokens": 231946104.0, + "step": 8964 + }, + { + 
"epoch": 0.9845157039314738, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6040406227111816, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.7024081945419312, + "num_tokens": 231967785.0, + "step": 8965 + }, + { + "epoch": 0.9846255216340875, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.251556158065796, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7010935544967651, + "num_tokens": 231994741.0, + "step": 8966 + }, + { + "epoch": 0.9847353393367011, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.54612135887146, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6909708380699158, + "num_tokens": 232016422.0, + "step": 8967 + }, + { + "epoch": 0.9848451570393147, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3976247310638428, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.7014676332473755, + "num_tokens": 232040771.0, + "step": 8968 + }, + { + "epoch": 0.9849549747419284, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3349103927612305, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7153611779212952, + "num_tokens": 232066750.0, + "step": 8969 + }, + { + "epoch": 0.985064792444542, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3637208938598633, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.705923855304718, + "num_tokens": 232090449.0, + "step": 8970 + }, + { + "epoch": 0.9851746101471557, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.055215358734131, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6859076023101807, + "num_tokens": 232122063.0, + "step": 8971 + }, + { + "epoch": 0.9852844278497693, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.239161968231201, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7148797512054443, + "num_tokens": 232149959.0, + "step": 8972 + }, + { + "epoch": 0.9853942455523831, + 
"ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.7046782970428467, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7258177995681763, + "num_tokens": 232169056.0, + "step": 8973 + }, + { + "epoch": 0.9855040632549967, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3287816047668457, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7056314945220947, + "num_tokens": 232194796.0, + "step": 8974 + }, + { + "epoch": 0.9856138809576104, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2143800258636475, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7073034644126892, + "num_tokens": 232224152.0, + "step": 8975 + }, + { + "epoch": 0.985723698660224, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.416192054748535, + "learning_rate": 1e-06, + "loss": 1.0863, + "mean_token_accuracy": 0.6858774423599243, + "num_tokens": 232248823.0, + "step": 8976 + }, + { + "epoch": 0.9858335163628377, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.156067132949829, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6933591365814209, + "num_tokens": 232278960.0, + "step": 8977 + }, + { + "epoch": 0.9859433340654513, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.621920347213745, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.726435124874115, + "num_tokens": 232299002.0, + "step": 8978 + }, + { + "epoch": 0.986053151768065, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2032060623168945, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7258092164993286, + "num_tokens": 232323783.0, + "step": 8979 + }, + { + "epoch": 0.9861629694706787, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3944005966186523, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7256499528884888, + "num_tokens": 232345943.0, + "step": 8980 + }, + { + "epoch": 0.9862727871732924, + "ewc_loss": 1.4960765838623047e-05, + 
"grad_norm": 2.432183027267456, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6979223489761353, + "num_tokens": 232370383.0, + "step": 8981 + }, + { + "epoch": 0.986382604875906, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2108266353607178, + "learning_rate": 1e-06, + "loss": 1.0707, + "mean_token_accuracy": 0.6879541873931885, + "num_tokens": 232399776.0, + "step": 8982 + }, + { + "epoch": 0.9864924225785197, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1059482097625732, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7013742327690125, + "num_tokens": 232429891.0, + "step": 8983 + }, + { + "epoch": 0.9866022402811333, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4242942333221436, + "learning_rate": 1e-06, + "loss": 1.1186, + "mean_token_accuracy": 0.6743770241737366, + "num_tokens": 232456867.0, + "step": 8984 + }, + { + "epoch": 0.986712057983747, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.345669984817505, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7286975383758545, + "num_tokens": 232481328.0, + "step": 8985 + }, + { + "epoch": 0.9868218756863606, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2986323833465576, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7132360935211182, + "num_tokens": 232504646.0, + "step": 8986 + }, + { + "epoch": 0.9869316933889744, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1554932594299316, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6959163546562195, + "num_tokens": 232535282.0, + "step": 8987 + }, + { + "epoch": 0.987041511091588, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3637545108795166, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7073578834533691, + "num_tokens": 232561180.0, + "step": 8988 + }, + { + "epoch": 0.9871513287942016, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2586216926574707, + 
"learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.7056196928024292, + "num_tokens": 232588832.0, + "step": 8989 + }, + { + "epoch": 0.9872611464968153, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.232401132583618, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6928326487541199, + "num_tokens": 232616528.0, + "step": 8990 + }, + { + "epoch": 0.9873709641994289, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.329467296600342, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6977437138557434, + "num_tokens": 232643620.0, + "step": 8991 + }, + { + "epoch": 0.9874807819020426, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 6.929108142852783, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.7049408555030823, + "num_tokens": 232672758.0, + "step": 8992 + }, + { + "epoch": 0.9875905996046562, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.490112781524658, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6999732851982117, + "num_tokens": 232698818.0, + "step": 8993 + }, + { + "epoch": 0.98770041730727, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2850892543792725, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6991101503372192, + "num_tokens": 232723684.0, + "step": 8994 + }, + { + "epoch": 0.9878102350098836, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.339938163757324, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7154569625854492, + "num_tokens": 232748578.0, + "step": 8995 + }, + { + "epoch": 0.9879200527124973, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.230964422225952, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7073161005973816, + "num_tokens": 232777194.0, + "step": 8996 + }, + { + "epoch": 0.9880298704151109, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1757075786590576, + "learning_rate": 1e-06, + "loss": 
1.0385, + "mean_token_accuracy": 0.6932225823402405, + "num_tokens": 232806509.0, + "step": 8997 + }, + { + "epoch": 0.9881396881177246, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4928791522979736, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6930087804794312, + "num_tokens": 232829602.0, + "step": 8998 + }, + { + "epoch": 0.9882495058203382, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.6301634311676025, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7063696384429932, + "num_tokens": 232853101.0, + "step": 8999 + }, + { + "epoch": 0.9883593235229519, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.443976879119873, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7132620811462402, + "num_tokens": 232876847.0, + "step": 9000 + }, + { + "epoch": 0.9884691412255655, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.491426706314087, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7259253263473511, + "num_tokens": 232897886.0, + "step": 9001 + }, + { + "epoch": 0.9885789589281793, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3076350688934326, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.691700279712677, + "num_tokens": 232923874.0, + "step": 9002 + }, + { + "epoch": 0.9886887766307929, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 4.4453253746032715, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.706294059753418, + "num_tokens": 232948175.0, + "step": 9003 + }, + { + "epoch": 0.9887985943334066, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.580512762069702, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6945862770080566, + "num_tokens": 232970566.0, + "step": 9004 + }, + { + "epoch": 0.9889084120360202, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.6390650272369385, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 
0.6903015375137329, + "num_tokens": 232993237.0, + "step": 9005 + }, + { + "epoch": 0.9890182297386338, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1027653217315674, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.689752995967865, + "num_tokens": 233027525.0, + "step": 9006 + }, + { + "epoch": 0.9891280474412475, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1354403495788574, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6997488737106323, + "num_tokens": 233061174.0, + "step": 9007 + }, + { + "epoch": 0.9892378651438611, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.359914779663086, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6912828683853149, + "num_tokens": 233087169.0, + "step": 9008 + }, + { + "epoch": 0.9893476828464749, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.283346176147461, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6949816942214966, + "num_tokens": 233113169.0, + "step": 9009 + }, + { + "epoch": 0.9894575005490885, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1218972206115723, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7029666304588318, + "num_tokens": 233145830.0, + "step": 9010 + }, + { + "epoch": 0.9895673182517022, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2612712383270264, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6925662159919739, + "num_tokens": 233172642.0, + "step": 9011 + }, + { + "epoch": 0.9896771359543158, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4802637100219727, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7219319343566895, + "num_tokens": 233194751.0, + "step": 9012 + }, + { + "epoch": 0.9897869536569295, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.055522918701172, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.7024460434913635, + "num_tokens": 
233225997.0, + "step": 9013 + }, + { + "epoch": 0.9898967713595431, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.255293130874634, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.7029120922088623, + "num_tokens": 233252009.0, + "step": 9014 + }, + { + "epoch": 0.9900065890621568, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2640275955200195, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7196576595306396, + "num_tokens": 233278741.0, + "step": 9015 + }, + { + "epoch": 0.9901164067647705, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5146114826202393, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.714827299118042, + "num_tokens": 233298654.0, + "step": 9016 + }, + { + "epoch": 0.9902262244673842, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2191050052642822, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7060959339141846, + "num_tokens": 233327394.0, + "step": 9017 + }, + { + "epoch": 0.9903360421699978, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3105406761169434, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7069500684738159, + "num_tokens": 233352462.0, + "step": 9018 + }, + { + "epoch": 0.9904458598726115, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5211808681488037, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7125044465065002, + "num_tokens": 233374329.0, + "step": 9019 + }, + { + "epoch": 0.9905556775752251, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2188336849212646, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7139577865600586, + "num_tokens": 233399704.0, + "step": 9020 + }, + { + "epoch": 0.9906654952778388, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5328047275543213, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7005230188369751, + "num_tokens": 233422103.0, + "step": 9021 + }, + 
{ + "epoch": 0.9907753129804524, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.627807140350342, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7052950859069824, + "num_tokens": 233444041.0, + "step": 9022 + }, + { + "epoch": 0.9908851306830662, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.235229015350342, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7124738693237305, + "num_tokens": 233472497.0, + "step": 9023 + }, + { + "epoch": 0.9909949483856798, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.433324098587036, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.7034746408462524, + "num_tokens": 233495761.0, + "step": 9024 + }, + { + "epoch": 0.9911047660882935, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.140700340270996, + "learning_rate": 1e-06, + "loss": 1.1031, + "mean_token_accuracy": 0.6764276027679443, + "num_tokens": 233526528.0, + "step": 9025 + }, + { + "epoch": 0.9912145837909071, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4294252395629883, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7083615064620972, + "num_tokens": 233549552.0, + "step": 9026 + }, + { + "epoch": 0.9913244014935207, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3952438831329346, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7052401304244995, + "num_tokens": 233575314.0, + "step": 9027 + }, + { + "epoch": 0.9914342191961344, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.327418088912964, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7009052634239197, + "num_tokens": 233600910.0, + "step": 9028 + }, + { + "epoch": 0.991544036898748, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3994131088256836, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6975284814834595, + "num_tokens": 233624970.0, + "step": 9029 + }, + { + "epoch": 0.9916538546013618, + 
"ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1299092769622803, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.696686863899231, + "num_tokens": 233656091.0, + "step": 9030 + }, + { + "epoch": 0.9917636723039754, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.45461106300354, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7092612981796265, + "num_tokens": 233678512.0, + "step": 9031 + }, + { + "epoch": 0.9918734900065891, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2832775115966797, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7070745825767517, + "num_tokens": 233704187.0, + "step": 9032 + }, + { + "epoch": 0.9919833077092027, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2832958698272705, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6992801427841187, + "num_tokens": 233731448.0, + "step": 9033 + }, + { + "epoch": 0.9920931254118164, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3937153816223145, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7075848579406738, + "num_tokens": 233755151.0, + "step": 9034 + }, + { + "epoch": 0.99220294311443, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 1.945749044418335, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7098429799079895, + "num_tokens": 233789917.0, + "step": 9035 + }, + { + "epoch": 0.9923127608170437, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.542762279510498, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7099770307540894, + "num_tokens": 233812965.0, + "step": 9036 + }, + { + "epoch": 0.9924225785196573, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.227384328842163, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.7048889994621277, + "num_tokens": 233842025.0, + "step": 9037 + }, + { + "epoch": 0.9925323962222711, + "ewc_loss": 1.4960765838623047e-05, + 
"grad_norm": 2.250593423843384, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7156203985214233, + "num_tokens": 233870113.0, + "step": 9038 + }, + { + "epoch": 0.9926422139248847, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.255443572998047, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7052810788154602, + "num_tokens": 233897948.0, + "step": 9039 + }, + { + "epoch": 0.9927520316274984, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3917224407196045, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6901592016220093, + "num_tokens": 233922903.0, + "step": 9040 + }, + { + "epoch": 0.992861849330112, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.31718373298645, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7304344177246094, + "num_tokens": 233949903.0, + "step": 9041 + }, + { + "epoch": 0.9929716670327257, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.2250139713287354, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6982347369194031, + "num_tokens": 233980714.0, + "step": 9042 + }, + { + "epoch": 0.9930814847353393, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.0927975177764893, + "learning_rate": 1e-06, + "loss": 1.1031, + "mean_token_accuracy": 0.6815624237060547, + "num_tokens": 234011634.0, + "step": 9043 + }, + { + "epoch": 0.993191302437953, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.528991222381592, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6840104460716248, + "num_tokens": 234035125.0, + "step": 9044 + }, + { + "epoch": 0.9933011201405667, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4942450523376465, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6967896223068237, + "num_tokens": 234057417.0, + "step": 9045 + }, + { + "epoch": 0.9934109378431804, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.710738182067871, + 
"learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6922346353530884, + "num_tokens": 234078743.0, + "step": 9046 + }, + { + "epoch": 0.993520755545794, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.28566312789917, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7265661954879761, + "num_tokens": 234105817.0, + "step": 9047 + }, + { + "epoch": 0.9936305732484076, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2293174266815186, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6903365254402161, + "num_tokens": 234132511.0, + "step": 9048 + }, + { + "epoch": 0.9937403909510213, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4836699962615967, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7179449796676636, + "num_tokens": 234155713.0, + "step": 9049 + }, + { + "epoch": 0.9938502086536349, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1124773025512695, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7074417471885681, + "num_tokens": 234184946.0, + "step": 9050 + }, + { + "epoch": 0.9939600263562486, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.5297043323516846, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7089648246765137, + "num_tokens": 234206926.0, + "step": 9051 + }, + { + "epoch": 0.9940698440588623, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.411290407180786, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.711387038230896, + "num_tokens": 234231917.0, + "step": 9052 + }, + { + "epoch": 0.994179661761476, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3693342208862305, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6939070820808411, + "num_tokens": 234257913.0, + "step": 9053 + }, + { + "epoch": 0.9942894794640896, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4813547134399414, + "learning_rate": 1e-06, + "loss": 
0.9587, + "mean_token_accuracy": 0.7188925743103027, + "num_tokens": 234281030.0, + "step": 9054 + }, + { + "epoch": 0.9943992971667033, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.408860206604004, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7097205519676208, + "num_tokens": 234304409.0, + "step": 9055 + }, + { + "epoch": 0.9945091148693169, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1468253135681152, + "learning_rate": 1e-06, + "loss": 1.1182, + "mean_token_accuracy": 0.6683571338653564, + "num_tokens": 234336413.0, + "step": 9056 + }, + { + "epoch": 0.9946189325719306, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.298215627670288, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7126570343971252, + "num_tokens": 234360815.0, + "step": 9057 + }, + { + "epoch": 0.9947287502745442, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3419318199157715, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7115594744682312, + "num_tokens": 234389038.0, + "step": 9058 + }, + { + "epoch": 0.994838567977158, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.559088945388794, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6967136859893799, + "num_tokens": 234410875.0, + "step": 9059 + }, + { + "epoch": 0.9949483856797716, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.381791353225708, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7179361581802368, + "num_tokens": 234436468.0, + "step": 9060 + }, + { + "epoch": 0.9950582033823853, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3064818382263184, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7076660394668579, + "num_tokens": 234462935.0, + "step": 9061 + }, + { + "epoch": 0.9951680210849989, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.171638250350952, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 
0.7075977921485901, + "num_tokens": 234496078.0, + "step": 9062 + }, + { + "epoch": 0.9952778387876126, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.30674147605896, + "learning_rate": 1e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.6865049004554749, + "num_tokens": 234524273.0, + "step": 9063 + }, + { + "epoch": 0.9953876564902262, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.3579092025756836, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7145012617111206, + "num_tokens": 234549620.0, + "step": 9064 + }, + { + "epoch": 0.9954974741928398, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1739635467529297, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7142953872680664, + "num_tokens": 234576957.0, + "step": 9065 + }, + { + "epoch": 0.9956072918954535, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.42020845413208, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6960461139678955, + "num_tokens": 234601466.0, + "step": 9066 + }, + { + "epoch": 0.9957171095980673, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.149177312850952, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7261321544647217, + "num_tokens": 234632198.0, + "step": 9067 + }, + { + "epoch": 0.9958269273006809, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.0594310760498047, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.709503173828125, + "num_tokens": 234662027.0, + "step": 9068 + }, + { + "epoch": 0.9959367450032945, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 8.406801223754883, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7092369794845581, + "num_tokens": 234688572.0, + "step": 9069 + }, + { + "epoch": 0.9960465627059082, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.343568801879883, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.684266209602356, + "num_tokens": 
234716877.0, + "step": 9070 + }, + { + "epoch": 0.9961563804085218, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.15694522857666, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6979539394378662, + "num_tokens": 234746332.0, + "step": 9071 + }, + { + "epoch": 0.9962661981111355, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4351983070373535, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7098046541213989, + "num_tokens": 234769472.0, + "step": 9072 + }, + { + "epoch": 0.9963760158137491, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3313002586364746, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6994326710700989, + "num_tokens": 234794215.0, + "step": 9073 + }, + { + "epoch": 0.9964858335163629, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1125450134277344, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7104803919792175, + "num_tokens": 234821761.0, + "step": 9074 + }, + { + "epoch": 0.9965956512189765, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.286006450653076, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7131674289703369, + "num_tokens": 234848284.0, + "step": 9075 + }, + { + "epoch": 0.9967054689215902, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.870445966720581, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7118096351623535, + "num_tokens": 234866900.0, + "step": 9076 + }, + { + "epoch": 0.9968152866242038, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.4430999755859375, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7162609696388245, + "num_tokens": 234890696.0, + "step": 9077 + }, + { + "epoch": 0.9969251043268175, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.7765450477600098, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7133727073669434, + "num_tokens": 234909194.0, + "step": 9078 + }, + { 
+ "epoch": 0.9970349220294311, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.1678457260131836, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6967155337333679, + "num_tokens": 234938499.0, + "step": 9079 + }, + { + "epoch": 0.9971447397320448, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 2.8402750492095947, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.702266275882721, + "num_tokens": 234956871.0, + "step": 9080 + }, + { + "epoch": 0.9972545574346585, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3060483932495117, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7213133573532104, + "num_tokens": 234983142.0, + "step": 9081 + }, + { + "epoch": 0.9973643751372722, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.023460865020752, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6988402605056763, + "num_tokens": 235015256.0, + "step": 9082 + }, + { + "epoch": 0.9974741928398858, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.2606539726257324, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7243582606315613, + "num_tokens": 235039088.0, + "step": 9083 + }, + { + "epoch": 0.9975840105424995, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.6537938117980957, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.7000753879547119, + "num_tokens": 235059458.0, + "step": 9084 + }, + { + "epoch": 0.9976938282451131, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4869003295898438, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7169030904769897, + "num_tokens": 235080777.0, + "step": 9085 + }, + { + "epoch": 0.9978036459477267, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2282493114471436, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7270292043685913, + "num_tokens": 235106971.0, + "step": 9086 + }, + { + "epoch": 0.9979134636503404, + 
"ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.724226713180542, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7127454280853271, + "num_tokens": 235126459.0, + "step": 9087 + }, + { + "epoch": 0.9980232813529542, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1711807250976562, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7031784057617188, + "num_tokens": 235154795.0, + "step": 9088 + }, + { + "epoch": 0.9981330990555678, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.5986528396606445, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6983823776245117, + "num_tokens": 235177941.0, + "step": 9089 + }, + { + "epoch": 0.9982429167581814, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2339205741882324, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6972286701202393, + "num_tokens": 235205466.0, + "step": 9090 + }, + { + "epoch": 0.9983527344607951, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.57521653175354, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6991537809371948, + "num_tokens": 235228890.0, + "step": 9091 + }, + { + "epoch": 0.9984625521634087, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.52742862701416, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7200466990470886, + "num_tokens": 235250592.0, + "step": 9092 + }, + { + "epoch": 0.9985723698660224, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1846587657928467, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7149169445037842, + "num_tokens": 235278580.0, + "step": 9093 + }, + { + "epoch": 0.998682187568636, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.468865156173706, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7193654775619507, + "num_tokens": 235301987.0, + "step": 9094 + }, + { + "epoch": 0.9987920052712497, + "ewc_loss": 1.5020370483398438e-05, + 
"grad_norm": 2.457138776779175, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6937974691390991, + "num_tokens": 235328062.0, + "step": 9095 + }, + { + "epoch": 0.9989018229738634, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4930765628814697, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7258721590042114, + "num_tokens": 235350562.0, + "step": 9096 + }, + { + "epoch": 0.9990116406764771, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1267025470733643, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7028627395629883, + "num_tokens": 235379626.0, + "step": 9097 + }, + { + "epoch": 0.9991214583790907, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3733627796173096, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7082792520523071, + "num_tokens": 235405941.0, + "step": 9098 + }, + { + "epoch": 0.9992312760817044, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 4.536092281341553, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7063605189323425, + "num_tokens": 235432729.0, + "step": 9099 + }, + { + "epoch": 0.999341093784318, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2621943950653076, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6983256936073303, + "num_tokens": 235463599.0, + "step": 9100 + }, + { + "epoch": 0.9994509114869317, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.471137046813965, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7049005031585693, + "num_tokens": 235486921.0, + "step": 9101 + }, + { + "epoch": 0.9995607291895453, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.1188242435455322, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6888991594314575, + "num_tokens": 235519812.0, + "step": 9102 + }, + { + "epoch": 0.9996705468921591, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.408568859100342, + 
"learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.723665177822113, + "num_tokens": 235543456.0, + "step": 9103 + }, + { + "epoch": 0.9997803645947727, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.229921817779541, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6906306147575378, + "num_tokens": 235572016.0, + "step": 9104 + }, + { + "epoch": 0.9998901822973864, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.509629726409912, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7148764133453369, + "num_tokens": 235594912.0, + "step": 9105 + }, + { + "epoch": 1.0, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.425065040588379, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7048968076705933, + "num_tokens": 235617245.0, + "step": 9106 + }, + { + "epoch": 1.0001098177026138, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.1304922103881836, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6963909268379211, + "num_tokens": 235647042.0, + "step": 9107 + }, + { + "epoch": 1.0002196354052273, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2042925357818604, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7112014889717102, + "num_tokens": 235675620.0, + "step": 9108 + }, + { + "epoch": 1.000329453107841, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.5104923248291016, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7068029642105103, + "num_tokens": 235699190.0, + "step": 9109 + }, + { + "epoch": 1.0004392708104546, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.152587413787842, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7097840905189514, + "num_tokens": 235729568.0, + "step": 9110 + }, + { + "epoch": 1.0005490885130683, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4294443130493164, + "learning_rate": 1e-06, + "loss": 0.9258, + 
"mean_token_accuracy": 0.7192785143852234, + "num_tokens": 235753975.0, + "step": 9111 + }, + { + "epoch": 1.0006589062156819, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.494170904159546, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7213751673698425, + "num_tokens": 235774114.0, + "step": 9112 + }, + { + "epoch": 1.0007687239182956, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3819096088409424, + "learning_rate": 1e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.6879594326019287, + "num_tokens": 235799505.0, + "step": 9113 + }, + { + "epoch": 1.0008785416209094, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.319197416305542, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7114100456237793, + "num_tokens": 235828000.0, + "step": 9114 + }, + { + "epoch": 1.000988359323523, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4977033138275146, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7393728494644165, + "num_tokens": 235851026.0, + "step": 9115 + }, + { + "epoch": 1.0010981770261367, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.31469464302063, + "learning_rate": 1e-06, + "loss": 1.0986, + "mean_token_accuracy": 0.6797242760658264, + "num_tokens": 235880806.0, + "step": 9116 + }, + { + "epoch": 1.0012079947287502, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.217494487762451, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.697029173374176, + "num_tokens": 235913442.0, + "step": 9117 + }, + { + "epoch": 1.001317812431364, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.500178337097168, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7237012982368469, + "num_tokens": 235936145.0, + "step": 9118 + }, + { + "epoch": 1.0014276301339775, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.7313907146453857, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 
0.7162393927574158, + "num_tokens": 235958222.0, + "step": 9119 + }, + { + "epoch": 1.0015374478365913, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.6674575805664062, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7209911942481995, + "num_tokens": 235980934.0, + "step": 9120 + }, + { + "epoch": 1.001647265539205, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.465904712677002, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7009027004241943, + "num_tokens": 236005418.0, + "step": 9121 + }, + { + "epoch": 1.0017570832418186, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.370095729827881, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7189339995384216, + "num_tokens": 236031486.0, + "step": 9122 + }, + { + "epoch": 1.0018669009444323, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4807543754577637, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6897206902503967, + "num_tokens": 236058222.0, + "step": 9123 + }, + { + "epoch": 1.0019767186470458, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.5110373497009277, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7258645296096802, + "num_tokens": 236082865.0, + "step": 9124 + }, + { + "epoch": 1.0020865363496596, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.0718507766723633, + "learning_rate": 1e-06, + "loss": 1.0863, + "mean_token_accuracy": 0.6902808547019958, + "num_tokens": 236118019.0, + "step": 9125 + }, + { + "epoch": 1.0021963540522731, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.631972551345825, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.711368978023529, + "num_tokens": 236140259.0, + "step": 9126 + }, + { + "epoch": 1.002306171754887, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.143285036087036, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6990960836410522, + "num_tokens": 
236171193.0, + "step": 9127 + }, + { + "epoch": 1.0024159894575007, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3605079650878906, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.709272027015686, + "num_tokens": 236198553.0, + "step": 9128 + }, + { + "epoch": 1.0025258071601142, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.5855984687805176, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6928942203521729, + "num_tokens": 236225340.0, + "step": 9129 + }, + { + "epoch": 1.002635624862728, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4709644317626953, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7170908451080322, + "num_tokens": 236249433.0, + "step": 9130 + }, + { + "epoch": 1.0027454425653415, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.258685350418091, + "learning_rate": 1e-06, + "loss": 1.0886, + "mean_token_accuracy": 0.6821720004081726, + "num_tokens": 236278592.0, + "step": 9131 + }, + { + "epoch": 1.0028552602679552, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.8002493381500244, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7282763719558716, + "num_tokens": 236296550.0, + "step": 9132 + }, + { + "epoch": 1.0029650779705688, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2626733779907227, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.707520604133606, + "num_tokens": 236325169.0, + "step": 9133 + }, + { + "epoch": 1.0030748956731825, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.186610460281372, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6889649629592896, + "num_tokens": 236355195.0, + "step": 9134 + }, + { + "epoch": 1.0031847133757963, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.0883846282958984, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6908674240112305, + "num_tokens": 236392651.0, + "step": 9135 + }, + { 
+ "epoch": 1.0032945310784098, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.0544397830963135, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.703680157661438, + "num_tokens": 236429123.0, + "step": 9136 + }, + { + "epoch": 1.0034043487810236, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3042263984680176, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7172802090644836, + "num_tokens": 236458505.0, + "step": 9137 + }, + { + "epoch": 1.0035141664836371, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.430109739303589, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.714576244354248, + "num_tokens": 236483365.0, + "step": 9138 + }, + { + "epoch": 1.0036239841862509, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3892900943756104, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7143787145614624, + "num_tokens": 236507918.0, + "step": 9139 + }, + { + "epoch": 1.0037338018888644, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.9784806966781616, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6983595490455627, + "num_tokens": 236542471.0, + "step": 9140 + }, + { + "epoch": 1.0038436195914782, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.490748167037964, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7194474935531616, + "num_tokens": 236566275.0, + "step": 9141 + }, + { + "epoch": 1.0039534372940917, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.888078212738037, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7199149131774902, + "num_tokens": 236588724.0, + "step": 9142 + }, + { + "epoch": 1.0040632549967055, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.2034435272216797, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6880842447280884, + "num_tokens": 236619468.0, + "step": 9143 + }, + { + "epoch": 1.0041730726993192, + 
"ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.643613576889038, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.707595944404602, + "num_tokens": 236641105.0, + "step": 9144 + }, + { + "epoch": 1.0042828904019327, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.6111061573028564, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.715825617313385, + "num_tokens": 236664417.0, + "step": 9145 + }, + { + "epoch": 1.0043927081045465, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.625709295272827, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7174546718597412, + "num_tokens": 236685394.0, + "step": 9146 + }, + { + "epoch": 1.00450252580716, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.5151498317718506, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7300574779510498, + "num_tokens": 236709099.0, + "step": 9147 + }, + { + "epoch": 1.0046123435097738, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.235447406768799, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6961049437522888, + "num_tokens": 236736328.0, + "step": 9148 + }, + { + "epoch": 1.0047221612123873, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.367496967315674, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.717197597026825, + "num_tokens": 236762359.0, + "step": 9149 + }, + { + "epoch": 1.004831978915001, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.2821364402770996, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6946747303009033, + "num_tokens": 236788664.0, + "step": 9150 + }, + { + "epoch": 1.0049417966176148, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.413900375366211, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7199741005897522, + "num_tokens": 236811440.0, + "step": 9151 + }, + { + "epoch": 1.0050516143202284, + "ewc_loss": 1.5020370483398438e-05, + 
"grad_norm": 2.138082265853882, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6939272284507751, + "num_tokens": 236844805.0, + "step": 9152 + }, + { + "epoch": 1.0051614320228421, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.3241395950317383, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.722302258014679, + "num_tokens": 236869318.0, + "step": 9153 + }, + { + "epoch": 1.0052712497254557, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.0399692058563232, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.733578622341156, + "num_tokens": 236901039.0, + "step": 9154 + }, + { + "epoch": 1.0053810674280694, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.301872730255127, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6931799054145813, + "num_tokens": 236928639.0, + "step": 9155 + }, + { + "epoch": 1.005490885130683, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.4109408855438232, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6893365979194641, + "num_tokens": 236955654.0, + "step": 9156 + }, + { + "epoch": 1.0056007028332967, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.3807718753814697, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7245135307312012, + "num_tokens": 236980378.0, + "step": 9157 + }, + { + "epoch": 1.0057105205359105, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.3079700469970703, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6975526809692383, + "num_tokens": 237008385.0, + "step": 9158 + }, + { + "epoch": 1.005820338238524, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.303375005722046, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7041700482368469, + "num_tokens": 237034555.0, + "step": 9159 + }, + { + "epoch": 1.0059301559411378, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.2720718383789062, + 
"learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6993846297264099, + "num_tokens": 237066324.0, + "step": 9160 + }, + { + "epoch": 1.0060399736437513, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.2474145889282227, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6993976831436157, + "num_tokens": 237094989.0, + "step": 9161 + }, + { + "epoch": 1.006149791346365, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 2.4909791946411133, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6899280548095703, + "num_tokens": 237120765.0, + "step": 9162 + }, + { + "epoch": 1.0062596090489786, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.1890292167663574, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7015587091445923, + "num_tokens": 237151511.0, + "step": 9163 + }, + { + "epoch": 1.0063694267515924, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.500720977783203, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7060872316360474, + "num_tokens": 237176336.0, + "step": 9164 + }, + { + "epoch": 1.0064792444542061, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.21600341796875, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6850182414054871, + "num_tokens": 237207576.0, + "step": 9165 + }, + { + "epoch": 1.0065890621568196, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.3034756183624268, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7060376405715942, + "num_tokens": 237235137.0, + "step": 9166 + }, + { + "epoch": 1.0066988798594334, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.583258867263794, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7190814018249512, + "num_tokens": 237255124.0, + "step": 9167 + }, + { + "epoch": 1.006808697562047, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.5864577293395996, + "learning_rate": 1e-06, + "loss": 
1.0438, + "mean_token_accuracy": 0.6898038983345032, + "num_tokens": 237278364.0, + "step": 9168 + }, + { + "epoch": 1.0069185152646607, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.394379138946533, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7243773937225342, + "num_tokens": 237302500.0, + "step": 9169 + }, + { + "epoch": 1.0070283329672742, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.3443729877471924, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.723682701587677, + "num_tokens": 237327908.0, + "step": 9170 + }, + { + "epoch": 1.007138150669888, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.4187533855438232, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.699592113494873, + "num_tokens": 237355523.0, + "step": 9171 + }, + { + "epoch": 1.0072479683725017, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.3050525188446045, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7046988010406494, + "num_tokens": 237381925.0, + "step": 9172 + }, + { + "epoch": 1.0073577860751153, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.3691205978393555, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7072535753250122, + "num_tokens": 237408436.0, + "step": 9173 + }, + { + "epoch": 1.007467603777729, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.368726968765259, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.713087797164917, + "num_tokens": 237434340.0, + "step": 9174 + }, + { + "epoch": 1.0075774214803426, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.2859315872192383, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7339956164360046, + "num_tokens": 237459680.0, + "step": 9175 + }, + { + "epoch": 1.0076872391829563, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.164419651031494, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 
0.6857430338859558, + "num_tokens": 237492224.0, + "step": 9176 + }, + { + "epoch": 1.0077970568855699, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 2.3028485774993896, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7379275560379028, + "num_tokens": 237516302.0, + "step": 9177 + }, + { + "epoch": 1.0079068745881836, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.437774181365967, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7055494785308838, + "num_tokens": 237540954.0, + "step": 9178 + }, + { + "epoch": 1.0080166922907974, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.4236693382263184, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.706836998462677, + "num_tokens": 237565679.0, + "step": 9179 + }, + { + "epoch": 1.008126509993411, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.2075417041778564, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6984232068061829, + "num_tokens": 237592971.0, + "step": 9180 + }, + { + "epoch": 1.0082363276960247, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 2.411769151687622, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7143804430961609, + "num_tokens": 237618973.0, + "step": 9181 + }, + { + "epoch": 1.0083461453986382, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.2301580905914307, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7020576596260071, + "num_tokens": 237649304.0, + "step": 9182 + }, + { + "epoch": 1.008455963101252, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.50929856300354, + "learning_rate": 1e-06, + "loss": 1.0918, + "mean_token_accuracy": 0.6734788417816162, + "num_tokens": 237676775.0, + "step": 9183 + }, + { + "epoch": 1.0085657808038655, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.119565486907959, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6985652446746826, + "num_tokens": 
237707669.0, + "step": 9184 + }, + { + "epoch": 1.0086755985064793, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.466297149658203, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7251692414283752, + "num_tokens": 237732335.0, + "step": 9185 + }, + { + "epoch": 1.008785416209093, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.5408055782318115, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7089695334434509, + "num_tokens": 237755742.0, + "step": 9186 + }, + { + "epoch": 1.0088952339117065, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 7.027556896209717, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7124106287956238, + "num_tokens": 237779404.0, + "step": 9187 + }, + { + "epoch": 1.0090050516143203, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.565187931060791, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7209728360176086, + "num_tokens": 237801424.0, + "step": 9188 + }, + { + "epoch": 1.0091148693169338, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.600435972213745, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7093427181243896, + "num_tokens": 237824471.0, + "step": 9189 + }, + { + "epoch": 1.0092246870195476, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.1202054023742676, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6851312518119812, + "num_tokens": 237856973.0, + "step": 9190 + }, + { + "epoch": 1.0093345047221611, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.255143880844116, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7033318877220154, + "num_tokens": 237886112.0, + "step": 9191 + }, + { + "epoch": 1.0094443224247749, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.435764789581299, + "learning_rate": 1e-06, + "loss": 1.115, + "mean_token_accuracy": 0.6794037222862244, + "num_tokens": 237912563.0, + "step": 9192 + }, + { + "epoch": 
1.0095541401273886, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.6699841022491455, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7218270897865295, + "num_tokens": 237931329.0, + "step": 9193 + }, + { + "epoch": 1.0096639578300022, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.969453811645508, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7185389399528503, + "num_tokens": 237949637.0, + "step": 9194 + }, + { + "epoch": 1.009773775532616, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.797067403793335, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7338582277297974, + "num_tokens": 237968165.0, + "step": 9195 + }, + { + "epoch": 1.0098835932352295, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.4422237873077393, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7009437680244446, + "num_tokens": 237993937.0, + "step": 9196 + }, + { + "epoch": 1.0099934109378432, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.4139859676361084, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7152734994888306, + "num_tokens": 238017976.0, + "step": 9197 + }, + { + "epoch": 1.0101032286404568, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.1780638694763184, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6895114183425903, + "num_tokens": 238049863.0, + "step": 9198 + }, + { + "epoch": 1.0102130463430705, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.237208604812622, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.722494900226593, + "num_tokens": 238079151.0, + "step": 9199 + }, + { + "epoch": 1.010322864045684, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.3850252628326416, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7205427885055542, + "num_tokens": 238103439.0, + "step": 9200 + }, + { + "epoch": 1.0104326817482978, + "ewc_loss": 
1.519918441772461e-05, + "grad_norm": 2.310070514678955, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6985230445861816, + "num_tokens": 238132132.0, + "step": 9201 + }, + { + "epoch": 1.0105424994509116, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.6820485591888428, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7276060581207275, + "num_tokens": 238152749.0, + "step": 9202 + }, + { + "epoch": 1.010652317153525, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.2247016429901123, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7218427658081055, + "num_tokens": 238180616.0, + "step": 9203 + }, + { + "epoch": 1.0107621348561389, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.4887702465057373, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7052810788154602, + "num_tokens": 238205218.0, + "step": 9204 + }, + { + "epoch": 1.0108719525587524, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.5072033405303955, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7094771862030029, + "num_tokens": 238228805.0, + "step": 9205 + }, + { + "epoch": 1.0109817702613662, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.308302640914917, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.697094202041626, + "num_tokens": 238258713.0, + "step": 9206 + }, + { + "epoch": 1.0110915879639797, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.6495108604431152, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7482446432113647, + "num_tokens": 238277651.0, + "step": 9207 + }, + { + "epoch": 1.0112014056665934, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2828965187072754, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6925684213638306, + "num_tokens": 238306098.0, + "step": 9208 + }, + { + "epoch": 1.0113112233692072, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 
2.5934364795684814, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7265299558639526, + "num_tokens": 238327071.0, + "step": 9209 + }, + { + "epoch": 1.0114210410718207, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.3524887561798096, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6979108452796936, + "num_tokens": 238353322.0, + "step": 9210 + }, + { + "epoch": 1.0115308587744345, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.1761040687561035, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7158392667770386, + "num_tokens": 238382018.0, + "step": 9211 + }, + { + "epoch": 1.011640676477048, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.638373613357544, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7253125905990601, + "num_tokens": 238401017.0, + "step": 9212 + }, + { + "epoch": 1.0117504941796618, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3822927474975586, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7103586792945862, + "num_tokens": 238428954.0, + "step": 9213 + }, + { + "epoch": 1.0118603118822753, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.3429465293884277, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7302603721618652, + "num_tokens": 238455499.0, + "step": 9214 + }, + { + "epoch": 1.011970129584889, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2791800498962402, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7099483609199524, + "num_tokens": 238481219.0, + "step": 9215 + }, + { + "epoch": 1.0120799472875028, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.3233470916748047, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7311932444572449, + "num_tokens": 238506894.0, + "step": 9216 + }, + { + "epoch": 1.0121897649901164, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.379887104034424, + "learning_rate": 1e-06, + "loss": 
0.9608, + "mean_token_accuracy": 0.7156100273132324, + "num_tokens": 238534255.0, + "step": 9217 + }, + { + "epoch": 1.0122995826927301, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.6341238021850586, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7176709771156311, + "num_tokens": 238555269.0, + "step": 9218 + }, + { + "epoch": 1.0124094003953437, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.1054229736328125, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6996380090713501, + "num_tokens": 238588256.0, + "step": 9219 + }, + { + "epoch": 1.0125192180979574, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.990943193435669, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6973556876182556, + "num_tokens": 238622800.0, + "step": 9220 + }, + { + "epoch": 1.012629035800571, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.6283812522888184, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7209047079086304, + "num_tokens": 238645351.0, + "step": 9221 + }, + { + "epoch": 1.0127388535031847, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5460381507873535, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6981523036956787, + "num_tokens": 238668786.0, + "step": 9222 + }, + { + "epoch": 1.0128486712057985, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.277808427810669, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7210511565208435, + "num_tokens": 238694597.0, + "step": 9223 + }, + { + "epoch": 1.012958488908412, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.390089273452759, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.722292423248291, + "num_tokens": 238719112.0, + "step": 9224 + }, + { + "epoch": 1.0130683066110258, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 1.9980530738830566, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 
0.7149527072906494, + "num_tokens": 238751808.0, + "step": 9225 + }, + { + "epoch": 1.0131781243136393, + "ewc_loss": 1.52587890625e-05, + "grad_norm": 2.492968797683716, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7175395488739014, + "num_tokens": 238776031.0, + "step": 9226 + }, + { + "epoch": 1.013287942016253, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.6301286220550537, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7068741321563721, + "num_tokens": 238796867.0, + "step": 9227 + }, + { + "epoch": 1.0133977597188666, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 2.4186036586761475, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7028199434280396, + "num_tokens": 238822315.0, + "step": 9228 + }, + { + "epoch": 1.0135075774214803, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.395831346511841, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7039402723312378, + "num_tokens": 238847142.0, + "step": 9229 + }, + { + "epoch": 1.013617395124094, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.285409927368164, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7029522657394409, + "num_tokens": 238877845.0, + "step": 9230 + }, + { + "epoch": 1.0137272128267076, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3201704025268555, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6911072134971619, + "num_tokens": 238904482.0, + "step": 9231 + }, + { + "epoch": 1.0138370305293214, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.489429473876953, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6986685991287231, + "num_tokens": 238928874.0, + "step": 9232 + }, + { + "epoch": 1.013946848231935, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5307798385620117, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6997875571250916, + "num_tokens": 
238955467.0, + "step": 9233 + }, + { + "epoch": 1.0140566659345487, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5140554904937744, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7301197052001953, + "num_tokens": 238977613.0, + "step": 9234 + }, + { + "epoch": 1.0141664836371622, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.497649669647217, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6993941068649292, + "num_tokens": 239002498.0, + "step": 9235 + }, + { + "epoch": 1.014276301339776, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.0667457580566406, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6968857645988464, + "num_tokens": 239038191.0, + "step": 9236 + }, + { + "epoch": 1.0143861190423897, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2695889472961426, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6835340261459351, + "num_tokens": 239067476.0, + "step": 9237 + }, + { + "epoch": 1.0144959367450033, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3158822059631348, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7148642539978027, + "num_tokens": 239094249.0, + "step": 9238 + }, + { + "epoch": 1.014605754447617, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.382901430130005, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7082875967025757, + "num_tokens": 239119765.0, + "step": 9239 + }, + { + "epoch": 1.0147155721502306, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.222316265106201, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7075421810150146, + "num_tokens": 239147445.0, + "step": 9240 + }, + { + "epoch": 1.0148253898528443, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.6074633598327637, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.721609890460968, + "num_tokens": 239169710.0, + "step": 9241 + }, + { + 
"epoch": 1.0149352075554579, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5438051223754883, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7247363328933716, + "num_tokens": 239191689.0, + "step": 9242 + }, + { + "epoch": 1.0150450252580716, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2476108074188232, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6914421319961548, + "num_tokens": 239220203.0, + "step": 9243 + }, + { + "epoch": 1.0151548429606854, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3636345863342285, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7335243225097656, + "num_tokens": 239247003.0, + "step": 9244 + }, + { + "epoch": 1.015264660663299, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.605147361755371, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7279841899871826, + "num_tokens": 239266673.0, + "step": 9245 + }, + { + "epoch": 1.0153744783659127, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3538713455200195, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7018910646438599, + "num_tokens": 239291722.0, + "step": 9246 + }, + { + "epoch": 1.0154842960685262, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.6198890209198, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7152343392372131, + "num_tokens": 239315226.0, + "step": 9247 + }, + { + "epoch": 1.01559411377114, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5473554134368896, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7205893993377686, + "num_tokens": 239341129.0, + "step": 9248 + }, + { + "epoch": 1.0157039314737535, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.359084129333496, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7101112008094788, + "num_tokens": 239366717.0, + "step": 9249 + }, + { + "epoch": 1.0158137491763672, + 
"ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.35652494430542, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7075788378715515, + "num_tokens": 239392989.0, + "step": 9250 + }, + { + "epoch": 1.015923566878981, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3208467960357666, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.698560357093811, + "num_tokens": 239424703.0, + "step": 9251 + }, + { + "epoch": 1.0160333845815945, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.312088966369629, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6856150031089783, + "num_tokens": 239454364.0, + "step": 9252 + }, + { + "epoch": 1.0161432022842083, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3654634952545166, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6874954104423523, + "num_tokens": 239482975.0, + "step": 9253 + }, + { + "epoch": 1.0162530199868218, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2192559242248535, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6878191232681274, + "num_tokens": 239515619.0, + "step": 9254 + }, + { + "epoch": 1.0163628376894356, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.531646490097046, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.7003448009490967, + "num_tokens": 239537698.0, + "step": 9255 + }, + { + "epoch": 1.0164726553920491, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.4962246417999268, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7153409123420715, + "num_tokens": 239562114.0, + "step": 9256 + }, + { + "epoch": 1.0165824730946629, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.225724458694458, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6919612884521484, + "num_tokens": 239590906.0, + "step": 9257 + }, + { + "epoch": 1.0166922907972764, + "ewc_loss": 1.5497207641601562e-05, + 
"grad_norm": 2.31734561920166, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7277939319610596, + "num_tokens": 239615166.0, + "step": 9258 + }, + { + "epoch": 1.0168021084998902, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.293999671936035, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7053124904632568, + "num_tokens": 239641273.0, + "step": 9259 + }, + { + "epoch": 1.016911926202504, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2724766731262207, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7042205333709717, + "num_tokens": 239669932.0, + "step": 9260 + }, + { + "epoch": 1.0170217439051175, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.59517502784729, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7157986760139465, + "num_tokens": 239691431.0, + "step": 9261 + }, + { + "epoch": 1.0171315616077312, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.7197182178497314, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7044625282287598, + "num_tokens": 239714438.0, + "step": 9262 + }, + { + "epoch": 1.0172413793103448, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.27950382232666, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.710376501083374, + "num_tokens": 239741908.0, + "step": 9263 + }, + { + "epoch": 1.0173511970129585, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2809746265411377, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.70514976978302, + "num_tokens": 239772040.0, + "step": 9264 + }, + { + "epoch": 1.017461014715572, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3239011764526367, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7278664708137512, + "num_tokens": 239798973.0, + "step": 9265 + }, + { + "epoch": 1.0175708324181858, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.344590663909912, + 
"learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7212275266647339, + "num_tokens": 239826141.0, + "step": 9266 + }, + { + "epoch": 1.0176806501207996, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3815250396728516, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6968059539794922, + "num_tokens": 239852736.0, + "step": 9267 + }, + { + "epoch": 1.017790467823413, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.727768898010254, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7294998168945312, + "num_tokens": 239872701.0, + "step": 9268 + }, + { + "epoch": 1.0179002855260268, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.1623454093933105, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6972259283065796, + "num_tokens": 239904145.0, + "step": 9269 + }, + { + "epoch": 1.0180101032286404, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.4059879779815674, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7408064603805542, + "num_tokens": 239928765.0, + "step": 9270 + }, + { + "epoch": 1.0181199209312541, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.438565254211426, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6895638108253479, + "num_tokens": 239953182.0, + "step": 9271 + }, + { + "epoch": 1.0182297386338677, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.6306145191192627, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7103265523910522, + "num_tokens": 239975609.0, + "step": 9272 + }, + { + "epoch": 1.0183395563364814, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.149367094039917, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7167482376098633, + "num_tokens": 240006532.0, + "step": 9273 + }, + { + "epoch": 1.0184493740390952, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.0262176990509033, + "learning_rate": 1e-06, + "loss": 
0.913, + "mean_token_accuracy": 0.7232933044433594, + "num_tokens": 240037463.0, + "step": 9274 + }, + { + "epoch": 1.0185591917417087, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.194751501083374, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6916416883468628, + "num_tokens": 240068775.0, + "step": 9275 + }, + { + "epoch": 1.0186690094443225, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.1689703464508057, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6987383365631104, + "num_tokens": 240098457.0, + "step": 9276 + }, + { + "epoch": 1.018778827146936, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3668582439422607, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6992688179016113, + "num_tokens": 240126114.0, + "step": 9277 + }, + { + "epoch": 1.0188886448495498, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.523648977279663, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7215503454208374, + "num_tokens": 240149000.0, + "step": 9278 + }, + { + "epoch": 1.0189984625521633, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.082846164703369, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6829386949539185, + "num_tokens": 240184417.0, + "step": 9279 + }, + { + "epoch": 1.019108280254777, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.252469062805176, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7015587091445923, + "num_tokens": 240213251.0, + "step": 9280 + }, + { + "epoch": 1.0192180979573908, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.273195266723633, + "learning_rate": 1e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6828254461288452, + "num_tokens": 240243902.0, + "step": 9281 + }, + { + "epoch": 1.0193279156600044, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.300171375274658, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 
0.712159276008606, + "num_tokens": 240271686.0, + "step": 9282 + }, + { + "epoch": 1.0194377333626181, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.341362237930298, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6945102214813232, + "num_tokens": 240298105.0, + "step": 9283 + }, + { + "epoch": 1.0195475510652316, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.330317974090576, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7151893377304077, + "num_tokens": 240323329.0, + "step": 9284 + }, + { + "epoch": 1.0196573687678454, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.4202866554260254, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.705447793006897, + "num_tokens": 240348312.0, + "step": 9285 + }, + { + "epoch": 1.019767186470459, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3540074825286865, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7071837186813354, + "num_tokens": 240373960.0, + "step": 9286 + }, + { + "epoch": 1.0198770041730727, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.640246629714966, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7172920107841492, + "num_tokens": 240395472.0, + "step": 9287 + }, + { + "epoch": 1.0199868218756865, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2513203620910645, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6932190656661987, + "num_tokens": 240427402.0, + "step": 9288 + }, + { + "epoch": 1.0200966395783, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.0285563468933105, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7201414108276367, + "num_tokens": 240461485.0, + "step": 9289 + }, + { + "epoch": 1.0202064572809137, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.398545026779175, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7145943641662598, + "num_tokens": 
240488154.0, + "step": 9290 + }, + { + "epoch": 1.0203162749835273, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3886640071868896, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7114864587783813, + "num_tokens": 240515041.0, + "step": 9291 + }, + { + "epoch": 1.020426092686141, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.21353816986084, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7162611484527588, + "num_tokens": 240543212.0, + "step": 9292 + }, + { + "epoch": 1.0205359103887546, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.1711409091949463, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.720168948173523, + "num_tokens": 240577429.0, + "step": 9293 + }, + { + "epoch": 1.0206457280913683, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3496556282043457, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7055535316467285, + "num_tokens": 240606022.0, + "step": 9294 + }, + { + "epoch": 1.020755545793982, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.593445301055908, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7029179334640503, + "num_tokens": 240627360.0, + "step": 9295 + }, + { + "epoch": 1.0208653634965956, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2329108715057373, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6995627880096436, + "num_tokens": 240655953.0, + "step": 9296 + }, + { + "epoch": 1.0209751811992094, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.337737560272217, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7118340134620667, + "num_tokens": 240681901.0, + "step": 9297 + }, + { + "epoch": 1.021084998901823, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2971177101135254, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.705870509147644, + "num_tokens": 240708896.0, + "step": 9298 + }, + { + 
"epoch": 1.0211948166044367, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.378767490386963, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7239924669265747, + "num_tokens": 240734080.0, + "step": 9299 + }, + { + "epoch": 1.0213046343070502, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.661531448364258, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7026509046554565, + "num_tokens": 240757776.0, + "step": 9300 + }, + { + "epoch": 1.021414452009664, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.6476283073425293, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7147930264472961, + "num_tokens": 240779967.0, + "step": 9301 + }, + { + "epoch": 1.0215242697122777, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3532238006591797, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7021949291229248, + "num_tokens": 240811222.0, + "step": 9302 + }, + { + "epoch": 1.0216340874148913, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.286717653274536, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7088367938995361, + "num_tokens": 240839514.0, + "step": 9303 + }, + { + "epoch": 1.021743905117505, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.251926898956299, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6922730207443237, + "num_tokens": 240869492.0, + "step": 9304 + }, + { + "epoch": 1.0218537228201185, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 3.2893292903900146, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6907093524932861, + "num_tokens": 240903330.0, + "step": 9305 + }, + { + "epoch": 1.0219635405227323, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.360459566116333, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7421621680259705, + "num_tokens": 240931670.0, + "step": 9306 + }, + { + "epoch": 1.0220733582253458, + 
"ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.448533535003662, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7269167900085449, + "num_tokens": 240955765.0, + "step": 9307 + }, + { + "epoch": 1.0221831759279596, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.233720541000366, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7115853428840637, + "num_tokens": 240984698.0, + "step": 9308 + }, + { + "epoch": 1.0222929936305734, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.4542396068573, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7083784937858582, + "num_tokens": 241011486.0, + "step": 9309 + }, + { + "epoch": 1.022402811333187, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3924412727355957, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6987431049346924, + "num_tokens": 241038036.0, + "step": 9310 + }, + { + "epoch": 1.0225126290358006, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5790252685546875, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7129320502281189, + "num_tokens": 241062846.0, + "step": 9311 + }, + { + "epoch": 1.0226224467384142, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.4084222316741943, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7198867797851562, + "num_tokens": 241089714.0, + "step": 9312 + }, + { + "epoch": 1.022732264441028, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.15837025642395, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.694892406463623, + "num_tokens": 241122272.0, + "step": 9313 + }, + { + "epoch": 1.0228420821436415, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.689639091491699, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7170770168304443, + "num_tokens": 241143105.0, + "step": 9314 + }, + { + "epoch": 1.0229518998462552, + "ewc_loss": 1.5497207641601562e-05, + 
"grad_norm": 2.3100643157958984, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6985005140304565, + "num_tokens": 241170861.0, + "step": 9315 + }, + { + "epoch": 1.023061717548869, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5800530910491943, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7161093950271606, + "num_tokens": 241193185.0, + "step": 9316 + }, + { + "epoch": 1.0231715352514825, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2551379203796387, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7141751050949097, + "num_tokens": 241222262.0, + "step": 9317 + }, + { + "epoch": 1.0232813529540963, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.6576080322265625, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7032511234283447, + "num_tokens": 241245067.0, + "step": 9318 + }, + { + "epoch": 1.0233911706567098, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.3165619373321533, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7091134786605835, + "num_tokens": 241272467.0, + "step": 9319 + }, + { + "epoch": 1.0235009883593236, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.485405921936035, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7065173983573914, + "num_tokens": 241297534.0, + "step": 9320 + }, + { + "epoch": 1.023610806061937, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2143330574035645, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7067655920982361, + "num_tokens": 241327611.0, + "step": 9321 + }, + { + "epoch": 1.0237206237645509, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2946736812591553, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7040512561798096, + "num_tokens": 241356755.0, + "step": 9322 + }, + { + "epoch": 1.0238304414671644, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5462253093719482, + 
"learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7138715386390686, + "num_tokens": 241378369.0, + "step": 9323 + }, + { + "epoch": 1.0239402591697782, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5916731357574463, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.710018515586853, + "num_tokens": 241404176.0, + "step": 9324 + }, + { + "epoch": 1.024050076872392, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5652246475219727, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7169848680496216, + "num_tokens": 241429475.0, + "step": 9325 + }, + { + "epoch": 1.0241598945750054, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.740623950958252, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7088437676429749, + "num_tokens": 241450642.0, + "step": 9326 + }, + { + "epoch": 1.0242697122776192, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.1268818378448486, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7276397347450256, + "num_tokens": 241480796.0, + "step": 9327 + }, + { + "epoch": 1.0243795299802327, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.9856600761413574, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7251571416854858, + "num_tokens": 241498114.0, + "step": 9328 + }, + { + "epoch": 1.0244893476828465, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5652246475219727, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7251157164573669, + "num_tokens": 241519122.0, + "step": 9329 + }, + { + "epoch": 1.02459916538546, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.1919867992401123, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7014744281768799, + "num_tokens": 241550097.0, + "step": 9330 + }, + { + "epoch": 1.0247089830880738, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.326106309890747, + "learning_rate": 1e-06, + "loss": 
1.0592, + "mean_token_accuracy": 0.6865130662918091, + "num_tokens": 241578237.0, + "step": 9331 + }, + { + "epoch": 1.0248188007906875, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.553104877471924, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7316023111343384, + "num_tokens": 241599538.0, + "step": 9332 + }, + { + "epoch": 1.024928618493301, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.5718345642089844, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7169066071510315, + "num_tokens": 241623433.0, + "step": 9333 + }, + { + "epoch": 1.0250384361959148, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.4411585330963135, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7143548130989075, + "num_tokens": 241647005.0, + "step": 9334 + }, + { + "epoch": 1.0251482538985284, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2390308380126953, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6986165046691895, + "num_tokens": 241676177.0, + "step": 9335 + }, + { + "epoch": 1.0252580716011421, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.233386278152466, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7180295586585999, + "num_tokens": 241705183.0, + "step": 9336 + }, + { + "epoch": 1.0253678893037557, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.631373405456543, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7013038396835327, + "num_tokens": 241728831.0, + "step": 9337 + }, + { + "epoch": 1.0254777070063694, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.1429638862609863, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7233293056488037, + "num_tokens": 241757350.0, + "step": 9338 + }, + { + "epoch": 1.0255875247089832, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.0896060466766357, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 
0.7019845247268677, + "num_tokens": 241792208.0, + "step": 9339 + }, + { + "epoch": 1.0256973424115967, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.295588493347168, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7169977426528931, + "num_tokens": 241820190.0, + "step": 9340 + }, + { + "epoch": 1.0258071601142105, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.396825075149536, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7144072651863098, + "num_tokens": 241844640.0, + "step": 9341 + }, + { + "epoch": 1.025916977816824, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.540661573410034, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7259817123413086, + "num_tokens": 241866014.0, + "step": 9342 + }, + { + "epoch": 1.0260267955194378, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.1990091800689697, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.686996579170227, + "num_tokens": 241896370.0, + "step": 9343 + }, + { + "epoch": 1.0261366132220513, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.301781415939331, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6998025178909302, + "num_tokens": 241921445.0, + "step": 9344 + }, + { + "epoch": 1.026246430924665, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.608459949493408, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7201859951019287, + "num_tokens": 241942105.0, + "step": 9345 + }, + { + "epoch": 1.0263562486272788, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.192504405975342, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6930594444274902, + "num_tokens": 241974155.0, + "step": 9346 + }, + { + "epoch": 1.0264660663298923, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2662606239318848, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7082077264785767, + "num_tokens": 
242000788.0, + "step": 9347 + }, + { + "epoch": 1.026575884032506, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.237272024154663, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7174507975578308, + "num_tokens": 242028371.0, + "step": 9348 + }, + { + "epoch": 1.0266857017351196, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2302873134613037, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7201046347618103, + "num_tokens": 242056593.0, + "step": 9349 + }, + { + "epoch": 1.0267955194377334, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.248199462890625, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7330501675605774, + "num_tokens": 242083641.0, + "step": 9350 + }, + { + "epoch": 1.026905337140347, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.297578811645508, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7240856885910034, + "num_tokens": 242109187.0, + "step": 9351 + }, + { + "epoch": 1.0270151548429607, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.5732409954071045, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7100642919540405, + "num_tokens": 242131308.0, + "step": 9352 + }, + { + "epoch": 1.0271249725455744, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.439621925354004, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7076747417449951, + "num_tokens": 242158139.0, + "step": 9353 + }, + { + "epoch": 1.027234790248188, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.364621639251709, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7020833492279053, + "num_tokens": 242185517.0, + "step": 9354 + }, + { + "epoch": 1.0273446079508017, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3566689491271973, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7089903354644775, + "num_tokens": 242212944.0, + "step": 9355 + }, + { + 
"epoch": 1.0274544256534153, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3458878993988037, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7098159790039062, + "num_tokens": 242237921.0, + "step": 9356 + }, + { + "epoch": 1.027564243356029, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3722214698791504, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6997783184051514, + "num_tokens": 242264084.0, + "step": 9357 + }, + { + "epoch": 1.0276740610586426, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.404719114303589, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6892730593681335, + "num_tokens": 242289515.0, + "step": 9358 + }, + { + "epoch": 1.0277838787612563, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.5633392333984375, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7252892255783081, + "num_tokens": 242312676.0, + "step": 9359 + }, + { + "epoch": 1.02789369646387, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2696642875671387, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.7009934782981873, + "num_tokens": 242341108.0, + "step": 9360 + }, + { + "epoch": 1.0280035141664836, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.490051507949829, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7095963358879089, + "num_tokens": 242364261.0, + "step": 9361 + }, + { + "epoch": 1.0281133318690974, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.641291618347168, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7138259410858154, + "num_tokens": 242385200.0, + "step": 9362 + }, + { + "epoch": 1.028223149571711, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.13511323928833, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.694330096244812, + "num_tokens": 242421340.0, + "step": 9363 + }, + { + "epoch": 1.0283329672743247, + "ewc_loss": 
1.5616416931152344e-05, + "grad_norm": 2.3233964443206787, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6964755058288574, + "num_tokens": 242449861.0, + "step": 9364 + }, + { + "epoch": 1.0284427849769382, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.349538803100586, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7300890684127808, + "num_tokens": 242476627.0, + "step": 9365 + }, + { + "epoch": 1.028552602679552, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4148495197296143, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7018799781799316, + "num_tokens": 242502244.0, + "step": 9366 + }, + { + "epoch": 1.0286624203821657, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4696431159973145, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7056149244308472, + "num_tokens": 242528568.0, + "step": 9367 + }, + { + "epoch": 1.0287722380847792, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3157317638397217, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7258307933807373, + "num_tokens": 242554618.0, + "step": 9368 + }, + { + "epoch": 1.028882055787393, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4640052318573, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7084888219833374, + "num_tokens": 242576525.0, + "step": 9369 + }, + { + "epoch": 1.0289918734900065, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.341033697128296, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6920535564422607, + "num_tokens": 242604868.0, + "step": 9370 + }, + { + "epoch": 1.0291016911926203, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.562317371368408, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.719758927822113, + "num_tokens": 242627934.0, + "step": 9371 + }, + { + "epoch": 1.0292115088952338, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 
2.654053211212158, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7214465141296387, + "num_tokens": 242648332.0, + "step": 9372 + }, + { + "epoch": 1.0293213265978476, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.323629856109619, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7186483144760132, + "num_tokens": 242675591.0, + "step": 9373 + }, + { + "epoch": 1.0294311443004613, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.5430033206939697, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7047123908996582, + "num_tokens": 242699493.0, + "step": 9374 + }, + { + "epoch": 1.0295409620030749, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3185739517211914, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6764241456985474, + "num_tokens": 242726766.0, + "step": 9375 + }, + { + "epoch": 1.0296507797056886, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.0611653327941895, + "learning_rate": 1e-06, + "loss": 1.0818, + "mean_token_accuracy": 0.6870651245117188, + "num_tokens": 242761048.0, + "step": 9376 + }, + { + "epoch": 1.0297605974083022, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.655963659286499, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6935081481933594, + "num_tokens": 242785961.0, + "step": 9377 + }, + { + "epoch": 1.029870415110916, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.314842700958252, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.7037862539291382, + "num_tokens": 242813003.0, + "step": 9378 + }, + { + "epoch": 1.0299802328135295, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.1806633472442627, + "learning_rate": 1e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6895688772201538, + "num_tokens": 242844550.0, + "step": 9379 + }, + { + "epoch": 1.0300900505161432, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.670433521270752, + 
"learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7048205137252808, + "num_tokens": 242868020.0, + "step": 9380 + }, + { + "epoch": 1.030199868218757, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.6329538822174072, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.698861837387085, + "num_tokens": 242890243.0, + "step": 9381 + }, + { + "epoch": 1.0303096859213705, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.333364248275757, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7138702869415283, + "num_tokens": 242915586.0, + "step": 9382 + }, + { + "epoch": 1.0304195036239843, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.270129442214966, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7011712193489075, + "num_tokens": 242943042.0, + "step": 9383 + }, + { + "epoch": 1.0305293213265978, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.340651035308838, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7143083214759827, + "num_tokens": 242968983.0, + "step": 9384 + }, + { + "epoch": 1.0306391390292116, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3811426162719727, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7131521701812744, + "num_tokens": 242999537.0, + "step": 9385 + }, + { + "epoch": 1.030748956731825, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2575583457946777, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.7013528347015381, + "num_tokens": 243029698.0, + "step": 9386 + }, + { + "epoch": 1.0308587744344389, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3406054973602295, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7158961296081543, + "num_tokens": 243057058.0, + "step": 9387 + }, + { + "epoch": 1.0309685921370524, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2955996990203857, + "learning_rate": 1e-06, + "loss": 
0.9493, + "mean_token_accuracy": 0.7134096622467041, + "num_tokens": 243083376.0, + "step": 9388 + }, + { + "epoch": 1.0310784098396661, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.533507823944092, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7097218632698059, + "num_tokens": 243106844.0, + "step": 9389 + }, + { + "epoch": 1.03118822754228, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.190659999847412, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.7055160403251648, + "num_tokens": 243139308.0, + "step": 9390 + }, + { + "epoch": 1.0312980452448934, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.362178087234497, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7014987468719482, + "num_tokens": 243165509.0, + "step": 9391 + }, + { + "epoch": 1.0314078629475072, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4592127799987793, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.719857931137085, + "num_tokens": 243187638.0, + "step": 9392 + }, + { + "epoch": 1.0315176806501207, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.7753255367279053, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7231305241584778, + "num_tokens": 243207999.0, + "step": 9393 + }, + { + "epoch": 1.0316274983527345, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.32841157913208, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.700035572052002, + "num_tokens": 243236555.0, + "step": 9394 + }, + { + "epoch": 1.031737316055348, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3603036403656006, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7286884784698486, + "num_tokens": 243262821.0, + "step": 9395 + }, + { + "epoch": 1.0318471337579618, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.336562395095825, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 
0.7036215662956238, + "num_tokens": 243292156.0, + "step": 9396 + }, + { + "epoch": 1.0319569514605755, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.477752685546875, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7058888673782349, + "num_tokens": 243317821.0, + "step": 9397 + }, + { + "epoch": 1.032066769163189, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3263561725616455, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.699219822883606, + "num_tokens": 243345859.0, + "step": 9398 + }, + { + "epoch": 1.0321765868658028, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.343942403793335, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6987686157226562, + "num_tokens": 243372271.0, + "step": 9399 + }, + { + "epoch": 1.0322864045684164, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3288111686706543, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6884218454360962, + "num_tokens": 243399628.0, + "step": 9400 + }, + { + "epoch": 1.0323962222710301, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.856754779815674, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7311420440673828, + "num_tokens": 243419002.0, + "step": 9401 + }, + { + "epoch": 1.0325060399736437, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3096816539764404, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7023770809173584, + "num_tokens": 243449824.0, + "step": 9402 + }, + { + "epoch": 1.0326158576762574, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.5354793071746826, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.706609845161438, + "num_tokens": 243474777.0, + "step": 9403 + }, + { + "epoch": 1.0327256753788712, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.1439008712768555, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7019698619842529, + "num_tokens": 
243508762.0, + "step": 9404 + }, + { + "epoch": 1.0328354930814847, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.774078130722046, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.728601336479187, + "num_tokens": 243528461.0, + "step": 9405 + }, + { + "epoch": 1.0329453107840985, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3305585384368896, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.706570029258728, + "num_tokens": 243556317.0, + "step": 9406 + }, + { + "epoch": 1.033055128486712, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2820990085601807, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.712894082069397, + "num_tokens": 243585250.0, + "step": 9407 + }, + { + "epoch": 1.0331649461893258, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.6932873725891113, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7279048562049866, + "num_tokens": 243606557.0, + "step": 9408 + }, + { + "epoch": 1.0332747638919393, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4192090034484863, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7032034993171692, + "num_tokens": 243632472.0, + "step": 9409 + }, + { + "epoch": 1.033384581594553, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.274120330810547, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7186155319213867, + "num_tokens": 243660567.0, + "step": 9410 + }, + { + "epoch": 1.0334943992971668, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.7808258533477783, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.715674877166748, + "num_tokens": 243681101.0, + "step": 9411 + }, + { + "epoch": 1.0336042169997803, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.434582471847534, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7026046514511108, + "num_tokens": 243710120.0, + "step": 9412 + }, + { + 
"epoch": 1.033714034702394, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.648221015930176, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7290569543838501, + "num_tokens": 243731564.0, + "step": 9413 + }, + { + "epoch": 1.0338238524050076, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.6211884021759033, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.74223792552948, + "num_tokens": 243752083.0, + "step": 9414 + }, + { + "epoch": 1.0339336701076214, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4498071670532227, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.745595395565033, + "num_tokens": 243776098.0, + "step": 9415 + }, + { + "epoch": 1.034043487810235, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.513406276702881, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7288213968276978, + "num_tokens": 243798831.0, + "step": 9416 + }, + { + "epoch": 1.0341533055128487, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2529070377349854, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7036335468292236, + "num_tokens": 243826736.0, + "step": 9417 + }, + { + "epoch": 1.0342631232154624, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4988858699798584, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7149704694747925, + "num_tokens": 243850406.0, + "step": 9418 + }, + { + "epoch": 1.034372940918076, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.900200843811035, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7403615713119507, + "num_tokens": 243866665.0, + "step": 9419 + }, + { + "epoch": 1.0344827586206897, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4361863136291504, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7169710397720337, + "num_tokens": 243891260.0, + "step": 9420 + }, + { + "epoch": 1.0345925763233033, + "ewc_loss": 
1.5616416931152344e-05, + "grad_norm": 2.5812768936157227, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7178446054458618, + "num_tokens": 243912863.0, + "step": 9421 + }, + { + "epoch": 1.034702394025917, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.1532161235809326, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7149837017059326, + "num_tokens": 243941500.0, + "step": 9422 + }, + { + "epoch": 1.0348122117285306, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4781315326690674, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6946862936019897, + "num_tokens": 243965604.0, + "step": 9423 + }, + { + "epoch": 1.0349220294311443, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4784581661224365, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7188470363616943, + "num_tokens": 243989569.0, + "step": 9424 + }, + { + "epoch": 1.035031847133758, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2172975540161133, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.722893476486206, + "num_tokens": 244018389.0, + "step": 9425 + }, + { + "epoch": 1.0351416648363716, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4599363803863525, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7123212218284607, + "num_tokens": 244042053.0, + "step": 9426 + }, + { + "epoch": 1.0352514825389854, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.2513248920440674, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7108465433120728, + "num_tokens": 244072302.0, + "step": 9427 + }, + { + "epoch": 1.035361300241599, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.593374729156494, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6993024349212646, + "num_tokens": 244095758.0, + "step": 9428 + }, + { + "epoch": 1.0354711179442126, + "ewc_loss": 1.5616416931152344e-05, + 
"grad_norm": 2.3233351707458496, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7136476039886475, + "num_tokens": 244124923.0, + "step": 9429 + }, + { + "epoch": 1.0355809356468262, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.4892144203186035, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7072260975837708, + "num_tokens": 244148357.0, + "step": 9430 + }, + { + "epoch": 1.03569075334944, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.3894083499908447, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7127393484115601, + "num_tokens": 244175301.0, + "step": 9431 + }, + { + "epoch": 1.0358005710520537, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.317218542098999, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7130697965621948, + "num_tokens": 244201386.0, + "step": 9432 + }, + { + "epoch": 1.0359103887546672, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.3453617095947266, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7018091082572937, + "num_tokens": 244228633.0, + "step": 9433 + }, + { + "epoch": 1.036020206457281, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.337425947189331, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7063283920288086, + "num_tokens": 244256930.0, + "step": 9434 + }, + { + "epoch": 1.0361300241598945, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.4282147884368896, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7396916747093201, + "num_tokens": 244278992.0, + "step": 9435 + }, + { + "epoch": 1.0362398418625083, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.304304599761963, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7099711298942566, + "num_tokens": 244307073.0, + "step": 9436 + }, + { + "epoch": 1.0363496595651218, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.4212427139282227, + 
"learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7081015110015869, + "num_tokens": 244333029.0, + "step": 9437 + }, + { + "epoch": 1.0364594772677356, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 2.6122055053710938, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7323734760284424, + "num_tokens": 244354581.0, + "step": 9438 + }, + { + "epoch": 1.036569294970349, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.6285078525543213, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7079172730445862, + "num_tokens": 244376640.0, + "step": 9439 + }, + { + "epoch": 1.0366791126729629, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.376293420791626, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6900537014007568, + "num_tokens": 244403385.0, + "step": 9440 + }, + { + "epoch": 1.0367889303755766, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3833415508270264, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6995334029197693, + "num_tokens": 244431391.0, + "step": 9441 + }, + { + "epoch": 1.0368987480781902, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.3106863498687744, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7025891542434692, + "num_tokens": 244458610.0, + "step": 9442 + }, + { + "epoch": 1.037008565780804, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.1506404876708984, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7000796794891357, + "num_tokens": 244490252.0, + "step": 9443 + }, + { + "epoch": 1.0371183834834174, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.3478169441223145, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7214990258216858, + "num_tokens": 244516220.0, + "step": 9444 + }, + { + "epoch": 1.0372282011860312, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.4034440517425537, + "learning_rate": 1e-06, + "loss": 
0.9847, + "mean_token_accuracy": 0.7122149467468262, + "num_tokens": 244544253.0, + "step": 9445 + }, + { + "epoch": 1.037338018888645, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.5184547901153564, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7124914526939392, + "num_tokens": 244567126.0, + "step": 9446 + }, + { + "epoch": 1.0374478365912585, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.238947868347168, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7125424742698669, + "num_tokens": 244595282.0, + "step": 9447 + }, + { + "epoch": 1.0375576542938723, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.703047752380371, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6939386129379272, + "num_tokens": 244617865.0, + "step": 9448 + }, + { + "epoch": 1.0376674719964858, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.228595495223999, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7091702222824097, + "num_tokens": 244645296.0, + "step": 9449 + }, + { + "epoch": 1.0377772896990995, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.84283185005188, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7310012578964233, + "num_tokens": 244663788.0, + "step": 9450 + }, + { + "epoch": 1.037887107401713, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.604304313659668, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7113978266716003, + "num_tokens": 244685999.0, + "step": 9451 + }, + { + "epoch": 1.0379969251043268, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3048651218414307, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6976000666618347, + "num_tokens": 244713331.0, + "step": 9452 + }, + { + "epoch": 1.0381067428069404, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.3022966384887695, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 
0.7213967442512512, + "num_tokens": 244740153.0, + "step": 9453 + }, + { + "epoch": 1.0382165605095541, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.4995429515838623, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7153860926628113, + "num_tokens": 244763602.0, + "step": 9454 + }, + { + "epoch": 1.0383263782121679, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.136218786239624, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7202966213226318, + "num_tokens": 244793724.0, + "step": 9455 + }, + { + "epoch": 1.0384361959147814, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.3508143424987793, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7331623435020447, + "num_tokens": 244818177.0, + "step": 9456 + }, + { + "epoch": 1.0385460136173952, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.131627321243286, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7207729816436768, + "num_tokens": 244848557.0, + "step": 9457 + }, + { + "epoch": 1.0386558313200087, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.392911434173584, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7204903364181519, + "num_tokens": 244874269.0, + "step": 9458 + }, + { + "epoch": 1.0387656490226225, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.6129138469696045, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.719397246837616, + "num_tokens": 244894552.0, + "step": 9459 + }, + { + "epoch": 1.038875466725236, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.3269169330596924, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7275069952011108, + "num_tokens": 244919365.0, + "step": 9460 + }, + { + "epoch": 1.0389852844278498, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.421360969543457, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7064715623855591, + "num_tokens": 
244943450.0, + "step": 9461 + }, + { + "epoch": 1.0390951021304635, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.4711265563964844, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7108041048049927, + "num_tokens": 244966856.0, + "step": 9462 + }, + { + "epoch": 1.039204919833077, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.155909776687622, + "learning_rate": 1e-06, + "loss": 1.0852, + "mean_token_accuracy": 0.6885488033294678, + "num_tokens": 244998059.0, + "step": 9463 + }, + { + "epoch": 1.0393147375356908, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.4035749435424805, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.701685905456543, + "num_tokens": 245022171.0, + "step": 9464 + }, + { + "epoch": 1.0394245552383043, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3633058071136475, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7216538786888123, + "num_tokens": 245049263.0, + "step": 9465 + }, + { + "epoch": 1.039534372940918, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.4016144275665283, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.732048749923706, + "num_tokens": 245073793.0, + "step": 9466 + }, + { + "epoch": 1.0396441906435316, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.1676058769226074, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7111866474151611, + "num_tokens": 245103029.0, + "step": 9467 + }, + { + "epoch": 1.0397540083461454, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.4623541831970215, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7080979943275452, + "num_tokens": 245128495.0, + "step": 9468 + }, + { + "epoch": 1.0398638260487592, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.310716390609741, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7193425893783569, + "num_tokens": 245156033.0, + "step": 9469 + }, + { 
+ "epoch": 1.0399736437513727, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.7675893306732178, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7296196222305298, + "num_tokens": 245174744.0, + "step": 9470 + }, + { + "epoch": 1.0400834614539864, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.533601760864258, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7042313814163208, + "num_tokens": 245200011.0, + "step": 9471 + }, + { + "epoch": 1.0401932791566, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.284515619277954, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7208986282348633, + "num_tokens": 245227165.0, + "step": 9472 + }, + { + "epoch": 1.0403030968592137, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.257596015930176, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.702603816986084, + "num_tokens": 245257985.0, + "step": 9473 + }, + { + "epoch": 1.0404129145618273, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.5452377796173096, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7430720329284668, + "num_tokens": 245279876.0, + "step": 9474 + }, + { + "epoch": 1.040522732264441, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.498762369155884, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.696695864200592, + "num_tokens": 245305272.0, + "step": 9475 + }, + { + "epoch": 1.0406325499670548, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.5043299198150635, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7073485851287842, + "num_tokens": 245328051.0, + "step": 9476 + }, + { + "epoch": 1.0407423676696683, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.1149580478668213, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7085239887237549, + "num_tokens": 245358157.0, + "step": 9477 + }, + { + "epoch": 1.040852185372282, + "ewc_loss": 
1.5735626220703125e-05, + "grad_norm": 2.278130054473877, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6998897790908813, + "num_tokens": 245384741.0, + "step": 9478 + }, + { + "epoch": 1.0409620030748956, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.3288793563842773, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.701321005821228, + "num_tokens": 245409463.0, + "step": 9479 + }, + { + "epoch": 1.0410718207775094, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.2590067386627197, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7276005744934082, + "num_tokens": 245438723.0, + "step": 9480 + }, + { + "epoch": 1.041181638480123, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.8859825134277344, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7245962619781494, + "num_tokens": 245458902.0, + "step": 9481 + }, + { + "epoch": 1.0412914561827367, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.2674942016601562, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6984238028526306, + "num_tokens": 245485163.0, + "step": 9482 + }, + { + "epoch": 1.0414012738853504, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.236666202545166, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7230787873268127, + "num_tokens": 245511526.0, + "step": 9483 + }, + { + "epoch": 1.041511091587964, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.599529266357422, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7176647186279297, + "num_tokens": 245533940.0, + "step": 9484 + }, + { + "epoch": 1.0416209092905777, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.6315090656280518, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7428098320960999, + "num_tokens": 245553366.0, + "step": 9485 + }, + { + "epoch": 1.0417307269931912, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 
2.6851136684417725, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7288844585418701, + "num_tokens": 245573331.0, + "step": 9486 + }, + { + "epoch": 1.041840544695805, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.6838951110839844, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7326982617378235, + "num_tokens": 245595441.0, + "step": 9487 + }, + { + "epoch": 1.0419503623984185, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.6170620918273926, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7216987609863281, + "num_tokens": 245616215.0, + "step": 9488 + }, + { + "epoch": 1.0420601801010323, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.4161734580993652, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7055555582046509, + "num_tokens": 245641474.0, + "step": 9489 + }, + { + "epoch": 1.042169997803646, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.442814350128174, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7059293985366821, + "num_tokens": 245666626.0, + "step": 9490 + }, + { + "epoch": 1.0422798155062596, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.2846968173980713, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.714151918888092, + "num_tokens": 245693609.0, + "step": 9491 + }, + { + "epoch": 1.0423896332088733, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.556403636932373, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7106509208679199, + "num_tokens": 245715909.0, + "step": 9492 + }, + { + "epoch": 1.0424994509114869, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3982903957366943, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7191688418388367, + "num_tokens": 245743609.0, + "step": 9493 + }, + { + "epoch": 1.0426092686141006, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.7704925537109375, + 
"learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7133818864822388, + "num_tokens": 245765807.0, + "step": 9494 + }, + { + "epoch": 1.0427190863167142, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.200191020965576, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6934758424758911, + "num_tokens": 245798404.0, + "step": 9495 + }, + { + "epoch": 1.042828904019328, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.5621132850646973, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7069298028945923, + "num_tokens": 245820725.0, + "step": 9496 + }, + { + "epoch": 1.0429387217219417, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.438859224319458, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7037069797515869, + "num_tokens": 245844551.0, + "step": 9497 + }, + { + "epoch": 1.0430485394245552, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.2883620262145996, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7266644239425659, + "num_tokens": 245870890.0, + "step": 9498 + }, + { + "epoch": 1.043158357127169, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.366023540496826, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7027522325515747, + "num_tokens": 245896044.0, + "step": 9499 + }, + { + "epoch": 1.0432681748297825, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.4315614700317383, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6923176050186157, + "num_tokens": 245923223.0, + "step": 9500 + }, + { + "epoch": 1.0433779925323963, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3589329719543457, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.7006893754005432, + "num_tokens": 245950175.0, + "step": 9501 + }, + { + "epoch": 1.0434878102350098, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3973681926727295, + "learning_rate": 1e-06, + "loss": 
0.9653, + "mean_token_accuracy": 0.7212281823158264, + "num_tokens": 245974500.0, + "step": 9502 + }, + { + "epoch": 1.0435976279376236, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.134700059890747, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6949254870414734, + "num_tokens": 246005891.0, + "step": 9503 + }, + { + "epoch": 1.043707445640237, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.476691961288452, + "learning_rate": 1e-06, + "loss": 1.0731, + "mean_token_accuracy": 0.6822102069854736, + "num_tokens": 246030961.0, + "step": 9504 + }, + { + "epoch": 1.0438172633428509, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.4793102741241455, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.7022542953491211, + "num_tokens": 246054915.0, + "step": 9505 + }, + { + "epoch": 1.0439270810454646, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3323254585266113, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7205803990364075, + "num_tokens": 246081890.0, + "step": 9506 + }, + { + "epoch": 1.0440368987480781, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.227454900741577, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.713672399520874, + "num_tokens": 246110456.0, + "step": 9507 + }, + { + "epoch": 1.044146716450692, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.1933043003082275, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6917740106582642, + "num_tokens": 246139574.0, + "step": 9508 + }, + { + "epoch": 1.0442565341533054, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.4134907722473145, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6971867680549622, + "num_tokens": 246165445.0, + "step": 9509 + }, + { + "epoch": 1.0443663518559192, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.6449601650238037, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 
0.7242656946182251, + "num_tokens": 246185870.0, + "step": 9510 + }, + { + "epoch": 1.0444761695585327, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.124868154525757, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.692353367805481, + "num_tokens": 246216822.0, + "step": 9511 + }, + { + "epoch": 1.0445859872611465, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.7483837604522705, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7231965065002441, + "num_tokens": 246239351.0, + "step": 9512 + }, + { + "epoch": 1.0446958049637602, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.689568042755127, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7266916036605835, + "num_tokens": 246262271.0, + "step": 9513 + }, + { + "epoch": 1.0448056226663738, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.473719835281372, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7105715870857239, + "num_tokens": 246286562.0, + "step": 9514 + }, + { + "epoch": 1.0449154403689875, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.508615255355835, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.7054241299629211, + "num_tokens": 246310265.0, + "step": 9515 + }, + { + "epoch": 1.045025258071601, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.4752368927001953, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6947787404060364, + "num_tokens": 246335195.0, + "step": 9516 + }, + { + "epoch": 1.0451350757742148, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.476102590560913, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7193856239318848, + "num_tokens": 246360099.0, + "step": 9517 + }, + { + "epoch": 1.0452448934768284, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.428966522216797, + "learning_rate": 1e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6793742775917053, + "num_tokens": 
246385857.0, + "step": 9518 + }, + { + "epoch": 1.0453547111794421, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4533164501190186, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7313970327377319, + "num_tokens": 246411918.0, + "step": 9519 + }, + { + "epoch": 1.0454645288820559, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.51334810256958, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7351155281066895, + "num_tokens": 246433859.0, + "step": 9520 + }, + { + "epoch": 1.0455743465846694, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.603362798690796, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7272628545761108, + "num_tokens": 246455155.0, + "step": 9521 + }, + { + "epoch": 1.0456841642872832, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.2332284450531006, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.708063542842865, + "num_tokens": 246482223.0, + "step": 9522 + }, + { + "epoch": 1.0457939819898967, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.408189058303833, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6999502182006836, + "num_tokens": 246507535.0, + "step": 9523 + }, + { + "epoch": 1.0459037996925105, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.4792697429656982, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.709730863571167, + "num_tokens": 246530389.0, + "step": 9524 + }, + { + "epoch": 1.046013617395124, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3083629608154297, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7036431431770325, + "num_tokens": 246558410.0, + "step": 9525 + }, + { + "epoch": 1.0461234350977378, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3062353134155273, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7233778238296509, + "num_tokens": 246586971.0, + "step": 9526 + }, + { + 
"epoch": 1.0462332528003515, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.343021869659424, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7134413719177246, + "num_tokens": 246611921.0, + "step": 9527 + }, + { + "epoch": 1.046343070502965, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.4550561904907227, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7137887477874756, + "num_tokens": 246637183.0, + "step": 9528 + }, + { + "epoch": 1.0464528882055788, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3907382488250732, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7032763957977295, + "num_tokens": 246661123.0, + "step": 9529 + }, + { + "epoch": 1.0465627059081923, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.4604344367980957, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7215504050254822, + "num_tokens": 246684376.0, + "step": 9530 + }, + { + "epoch": 1.046672523610806, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.762310028076172, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7189353704452515, + "num_tokens": 246703163.0, + "step": 9531 + }, + { + "epoch": 1.0467823413134196, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.3450751304626465, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7239564061164856, + "num_tokens": 246727436.0, + "step": 9532 + }, + { + "epoch": 1.0468921590160334, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.301722288131714, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7161941528320312, + "num_tokens": 246754963.0, + "step": 9533 + }, + { + "epoch": 1.0470019767186471, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5541446208953857, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7245194911956787, + "num_tokens": 246777203.0, + "step": 9534 + }, + { + "epoch": 1.0471117944212607, + 
"ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5585238933563232, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7386074662208557, + "num_tokens": 246800178.0, + "step": 9535 + }, + { + "epoch": 1.0472216121238744, + "ewc_loss": 1.5854835510253906e-05, + "grad_norm": 2.327374219894409, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.700284481048584, + "num_tokens": 246827037.0, + "step": 9536 + }, + { + "epoch": 1.047331429826488, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.557887315750122, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7414059042930603, + "num_tokens": 246848866.0, + "step": 9537 + }, + { + "epoch": 1.0474412475291017, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5566272735595703, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.718145489692688, + "num_tokens": 246869487.0, + "step": 9538 + }, + { + "epoch": 1.0475510652317153, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4143946170806885, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6910362243652344, + "num_tokens": 246897217.0, + "step": 9539 + }, + { + "epoch": 1.047660882934329, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.294966220855713, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6970802545547485, + "num_tokens": 246927339.0, + "step": 9540 + }, + { + "epoch": 1.0477707006369428, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.1951661109924316, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7142306566238403, + "num_tokens": 246957855.0, + "step": 9541 + }, + { + "epoch": 1.0478805183395563, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4878599643707275, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7248373031616211, + "num_tokens": 246979698.0, + "step": 9542 + }, + { + "epoch": 1.04799033604217, + "ewc_loss": 1.5974044799804688e-05, + 
"grad_norm": 2.2177352905273438, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7346744537353516, + "num_tokens": 247007272.0, + "step": 9543 + }, + { + "epoch": 1.0481001537447836, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.8289241790771484, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.729884147644043, + "num_tokens": 247030032.0, + "step": 9544 + }, + { + "epoch": 1.0482099714473974, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.534440040588379, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7201054096221924, + "num_tokens": 247053901.0, + "step": 9545 + }, + { + "epoch": 1.048319789150011, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5014474391937256, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7055495977401733, + "num_tokens": 247076976.0, + "step": 9546 + }, + { + "epoch": 1.0484296068526247, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.7350966930389404, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7282053232192993, + "num_tokens": 247097690.0, + "step": 9547 + }, + { + "epoch": 1.0485394245552384, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.55098295211792, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7302271127700806, + "num_tokens": 247121076.0, + "step": 9548 + }, + { + "epoch": 1.048649242257852, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.446986675262451, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7240604162216187, + "num_tokens": 247145963.0, + "step": 9549 + }, + { + "epoch": 1.0487590599604657, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4093258380889893, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.704582929611206, + "num_tokens": 247172368.0, + "step": 9550 + }, + { + "epoch": 1.0488688776630792, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.22255802154541, + 
"learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6920647621154785, + "num_tokens": 247201180.0, + "step": 9551 + }, + { + "epoch": 1.048978695365693, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.1877939701080322, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6993955373764038, + "num_tokens": 247232563.0, + "step": 9552 + }, + { + "epoch": 1.0490885130683065, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3929808139801025, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6901524066925049, + "num_tokens": 247257790.0, + "step": 9553 + }, + { + "epoch": 1.0491983307709203, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.52479887008667, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7121513485908508, + "num_tokens": 247279700.0, + "step": 9554 + }, + { + "epoch": 1.0493081484735338, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5464494228363037, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7293767929077148, + "num_tokens": 247303222.0, + "step": 9555 + }, + { + "epoch": 1.0494179661761476, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5292932987213135, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7301241159439087, + "num_tokens": 247325782.0, + "step": 9556 + }, + { + "epoch": 1.0495277838787613, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3156919479370117, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.689629316329956, + "num_tokens": 247356404.0, + "step": 9557 + }, + { + "epoch": 1.0496376015813749, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.1094861030578613, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6945253014564514, + "num_tokens": 247389537.0, + "step": 9558 + }, + { + "epoch": 1.0497474192839886, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.6231586933135986, + "learning_rate": 1e-06, + "loss": 
0.9814, + "mean_token_accuracy": 0.7061949968338013, + "num_tokens": 247412214.0, + "step": 9559 + }, + { + "epoch": 1.0498572369866022, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.582016706466675, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7428375482559204, + "num_tokens": 247432745.0, + "step": 9560 + }, + { + "epoch": 1.049967054689216, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.394216775894165, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7134283781051636, + "num_tokens": 247457197.0, + "step": 9561 + }, + { + "epoch": 1.0500768723918297, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.7539541721343994, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6924194097518921, + "num_tokens": 247479184.0, + "step": 9562 + }, + { + "epoch": 1.0501866900944432, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5935258865356445, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7307692766189575, + "num_tokens": 247502682.0, + "step": 9563 + }, + { + "epoch": 1.050296507797057, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3672938346862793, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6990042924880981, + "num_tokens": 247531199.0, + "step": 9564 + }, + { + "epoch": 1.0504063254996705, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.350740432739258, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6927376389503479, + "num_tokens": 247560239.0, + "step": 9565 + }, + { + "epoch": 1.0505161432022843, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4041051864624023, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6971181631088257, + "num_tokens": 247584598.0, + "step": 9566 + }, + { + "epoch": 1.0506259609048978, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3536336421966553, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 
0.7197421789169312, + "num_tokens": 247610185.0, + "step": 9567 + }, + { + "epoch": 1.0507357786075116, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.577704668045044, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7451679706573486, + "num_tokens": 247632298.0, + "step": 9568 + }, + { + "epoch": 1.050845596310125, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.243208646774292, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7251933813095093, + "num_tokens": 247659916.0, + "step": 9569 + }, + { + "epoch": 1.0509554140127388, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2547600269317627, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7191506028175354, + "num_tokens": 247688480.0, + "step": 9570 + }, + { + "epoch": 1.0510652317153526, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.362455129623413, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.707524299621582, + "num_tokens": 247716386.0, + "step": 9571 + }, + { + "epoch": 1.0511750494179661, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.425601005554199, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7067615389823914, + "num_tokens": 247741248.0, + "step": 9572 + }, + { + "epoch": 1.05128486712058, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.436582326889038, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7127737402915955, + "num_tokens": 247767135.0, + "step": 9573 + }, + { + "epoch": 1.0513946848231934, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.278111696243286, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7111169099807739, + "num_tokens": 247793866.0, + "step": 9574 + }, + { + "epoch": 1.0515045025258072, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2213802337646484, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7034447193145752, + "num_tokens": 
247826488.0, + "step": 9575 + }, + { + "epoch": 1.0516143202284207, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.564053773880005, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6961283087730408, + "num_tokens": 247848265.0, + "step": 9576 + }, + { + "epoch": 1.0517241379310345, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2065236568450928, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7066787481307983, + "num_tokens": 247878027.0, + "step": 9577 + }, + { + "epoch": 1.0518339556336482, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.7177834510803223, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7221901416778564, + "num_tokens": 247899160.0, + "step": 9578 + }, + { + "epoch": 1.0519437733362618, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4823122024536133, + "learning_rate": 1e-06, + "loss": 1.0995, + "mean_token_accuracy": 0.6767305731773376, + "num_tokens": 247924094.0, + "step": 9579 + }, + { + "epoch": 1.0520535910388755, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.0227434635162354, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6955338716506958, + "num_tokens": 247958200.0, + "step": 9580 + }, + { + "epoch": 1.052163408741489, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.416686773300171, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7092280387878418, + "num_tokens": 247982276.0, + "step": 9581 + }, + { + "epoch": 1.0522732264441028, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.303894519805908, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7162202596664429, + "num_tokens": 248010082.0, + "step": 9582 + }, + { + "epoch": 1.0523830441467164, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4593849182128906, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7258914709091187, + "num_tokens": 248035005.0, + "step": 9583 + }, + { 
+ "epoch": 1.05249286184933, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.291264772415161, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7037190198898315, + "num_tokens": 248061281.0, + "step": 9584 + }, + { + "epoch": 1.0526026795519439, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.327150821685791, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6993135213851929, + "num_tokens": 248089291.0, + "step": 9585 + }, + { + "epoch": 1.0527124972545574, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.161031484603882, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7131792902946472, + "num_tokens": 248118085.0, + "step": 9586 + }, + { + "epoch": 1.0528223149571712, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4693093299865723, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7084792852401733, + "num_tokens": 248141100.0, + "step": 9587 + }, + { + "epoch": 1.0529321326597847, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4392364025115967, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7151732444763184, + "num_tokens": 248165329.0, + "step": 9588 + }, + { + "epoch": 1.0530419503623984, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2442822456359863, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.69426429271698, + "num_tokens": 248196463.0, + "step": 9589 + }, + { + "epoch": 1.053151768065012, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.522172451019287, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7332857847213745, + "num_tokens": 248219419.0, + "step": 9590 + }, + { + "epoch": 1.0532615857676257, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.67134952545166, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6932919025421143, + "num_tokens": 248242837.0, + "step": 9591 + }, + { + "epoch": 1.0533714034702395, + 
"ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4958341121673584, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7072272300720215, + "num_tokens": 248267252.0, + "step": 9592 + }, + { + "epoch": 1.053481221172853, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.0629637241363525, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6912695169448853, + "num_tokens": 248303356.0, + "step": 9593 + }, + { + "epoch": 1.0535910388754668, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.382401704788208, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6997199058532715, + "num_tokens": 248331023.0, + "step": 9594 + }, + { + "epoch": 1.0537008565780803, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.0878498554229736, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7156041860580444, + "num_tokens": 248360940.0, + "step": 9595 + }, + { + "epoch": 1.053810674280694, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.247068405151367, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7105362415313721, + "num_tokens": 248390650.0, + "step": 9596 + }, + { + "epoch": 1.0539204919833076, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2973523139953613, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7117696404457092, + "num_tokens": 248417494.0, + "step": 9597 + }, + { + "epoch": 1.0540303096859214, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.1468698978424072, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7304952144622803, + "num_tokens": 248444723.0, + "step": 9598 + }, + { + "epoch": 1.0541401273885351, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4553732872009277, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7042452692985535, + "num_tokens": 248466685.0, + "step": 9599 + }, + { + "epoch": 1.0542499450911487, + "ewc_loss": 1.5974044799804688e-05, 
+ "grad_norm": 2.7383663654327393, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7200915813446045, + "num_tokens": 248488630.0, + "step": 9600 + }, + { + "epoch": 1.0543597627937624, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.206102132797241, + "learning_rate": 1e-06, + "loss": 1.0931, + "mean_token_accuracy": 0.6771665811538696, + "num_tokens": 248518374.0, + "step": 9601 + }, + { + "epoch": 1.054469580496376, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3526415824890137, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7080081701278687, + "num_tokens": 248542665.0, + "step": 9602 + }, + { + "epoch": 1.0545793981989897, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.0737860202789307, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7204618453979492, + "num_tokens": 248576368.0, + "step": 9603 + }, + { + "epoch": 1.0546892159016032, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.1652681827545166, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6885991096496582, + "num_tokens": 248606520.0, + "step": 9604 + }, + { + "epoch": 1.054799033604217, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2571122646331787, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6814092397689819, + "num_tokens": 248634950.0, + "step": 9605 + }, + { + "epoch": 1.0549088513068308, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.269550323486328, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7111415266990662, + "num_tokens": 248663549.0, + "step": 9606 + }, + { + "epoch": 1.0550186690094443, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4763596057891846, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7020969390869141, + "num_tokens": 248688326.0, + "step": 9607 + }, + { + "epoch": 1.055128486712058, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.357635974884033, + 
"learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7054337859153748, + "num_tokens": 248714511.0, + "step": 9608 + }, + { + "epoch": 1.0552383044146716, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.300684690475464, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7132203578948975, + "num_tokens": 248742304.0, + "step": 9609 + }, + { + "epoch": 1.0553481221172853, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3541266918182373, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7318403720855713, + "num_tokens": 248768175.0, + "step": 9610 + }, + { + "epoch": 1.0554579398198989, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2552895545959473, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.703835129737854, + "num_tokens": 248798423.0, + "step": 9611 + }, + { + "epoch": 1.0555677575225126, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3701694011688232, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7054002285003662, + "num_tokens": 248823545.0, + "step": 9612 + }, + { + "epoch": 1.0556775752251264, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.7461178302764893, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7298606038093567, + "num_tokens": 248842430.0, + "step": 9613 + }, + { + "epoch": 1.05578739292774, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2548093795776367, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7051630020141602, + "num_tokens": 248870096.0, + "step": 9614 + }, + { + "epoch": 1.0558972106303537, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5782124996185303, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7044824361801147, + "num_tokens": 248893581.0, + "step": 9615 + }, + { + "epoch": 1.0560070283329672, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3864083290100098, + "learning_rate": 1e-06, + "loss": 
0.9888, + "mean_token_accuracy": 0.7006428241729736, + "num_tokens": 248919001.0, + "step": 9616 + }, + { + "epoch": 1.056116846035581, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2821362018585205, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7030467987060547, + "num_tokens": 248946510.0, + "step": 9617 + }, + { + "epoch": 1.0562266637381945, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.215517044067383, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7034562826156616, + "num_tokens": 248978612.0, + "step": 9618 + }, + { + "epoch": 1.0563364814408083, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.2317652702331543, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6837278604507446, + "num_tokens": 249008011.0, + "step": 9619 + }, + { + "epoch": 1.0564462991434218, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3988752365112305, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7220059633255005, + "num_tokens": 249031806.0, + "step": 9620 + }, + { + "epoch": 1.0565561168460356, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.6289851665496826, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7119959592819214, + "num_tokens": 249052702.0, + "step": 9621 + }, + { + "epoch": 1.0566659345486493, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.103403091430664, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6802500486373901, + "num_tokens": 249087582.0, + "step": 9622 + }, + { + "epoch": 1.0567757522512629, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.608600378036499, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7025476694107056, + "num_tokens": 249111026.0, + "step": 9623 + }, + { + "epoch": 1.0568855699538766, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4827399253845215, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 
0.7070229053497314, + "num_tokens": 249133904.0, + "step": 9624 + }, + { + "epoch": 1.0569953876564901, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.8920767307281494, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7116053104400635, + "num_tokens": 249153840.0, + "step": 9625 + }, + { + "epoch": 1.057105205359104, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5549166202545166, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7023927569389343, + "num_tokens": 249177898.0, + "step": 9626 + }, + { + "epoch": 1.0572150230617177, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.598980188369751, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7167538404464722, + "num_tokens": 249199392.0, + "step": 9627 + }, + { + "epoch": 1.0573248407643312, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.304649829864502, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7058142423629761, + "num_tokens": 249227251.0, + "step": 9628 + }, + { + "epoch": 1.057434658466945, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.6165690422058105, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7310826778411865, + "num_tokens": 249248156.0, + "step": 9629 + }, + { + "epoch": 1.0575444761695585, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.7010746002197266, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6890239715576172, + "num_tokens": 249271245.0, + "step": 9630 + }, + { + "epoch": 1.0576542938721722, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.510221481323242, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6889244318008423, + "num_tokens": 249297210.0, + "step": 9631 + }, + { + "epoch": 1.0577641115747858, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.708221197128296, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7239981293678284, + "num_tokens": 
249317807.0, + "step": 9632 + }, + { + "epoch": 1.0578739292773995, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.269808769226074, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7114571332931519, + "num_tokens": 249347204.0, + "step": 9633 + }, + { + "epoch": 1.057983746980013, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.1880905628204346, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7068297863006592, + "num_tokens": 249377018.0, + "step": 9634 + }, + { + "epoch": 1.0580935646826268, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.2974839210510254, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7194496393203735, + "num_tokens": 249404706.0, + "step": 9635 + }, + { + "epoch": 1.0582033823852406, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4953627586364746, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7186832427978516, + "num_tokens": 249428576.0, + "step": 9636 + }, + { + "epoch": 1.0583132000878541, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.408186435699463, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7263909578323364, + "num_tokens": 249455495.0, + "step": 9637 + }, + { + "epoch": 1.0584230177904679, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.647611618041992, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7237678170204163, + "num_tokens": 249474722.0, + "step": 9638 + }, + { + "epoch": 1.0585328354930814, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.410695791244507, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7136319875717163, + "num_tokens": 249501296.0, + "step": 9639 + }, + { + "epoch": 1.0586426531956952, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.8363113403320312, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7280598282814026, + "num_tokens": 249521551.0, + "step": 9640 + }, + { + 
"epoch": 1.0587524708983087, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.445829153060913, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7193220257759094, + "num_tokens": 249546353.0, + "step": 9641 + }, + { + "epoch": 1.0588622886009225, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.40262508392334, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.7024736404418945, + "num_tokens": 249572895.0, + "step": 9642 + }, + { + "epoch": 1.0589721063035362, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4720380306243896, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.732142448425293, + "num_tokens": 249596783.0, + "step": 9643 + }, + { + "epoch": 1.0590819240061498, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.418231725692749, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7087557315826416, + "num_tokens": 249623279.0, + "step": 9644 + }, + { + "epoch": 1.0591917417087635, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4012441635131836, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7069219350814819, + "num_tokens": 249651574.0, + "step": 9645 + }, + { + "epoch": 1.059301559411377, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.396149158477783, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7012335062026978, + "num_tokens": 249677631.0, + "step": 9646 + }, + { + "epoch": 1.0594113771139908, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4368715286254883, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7004848718643188, + "num_tokens": 249703995.0, + "step": 9647 + }, + { + "epoch": 1.0595211948166043, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5081279277801514, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7152345180511475, + "num_tokens": 249728629.0, + "step": 9648 + }, + { + "epoch": 1.059631012519218, + "ewc_loss": 
1.609325408935547e-05, + "grad_norm": 2.284074068069458, + "learning_rate": 1e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6942914724349976, + "num_tokens": 249757537.0, + "step": 9649 + }, + { + "epoch": 1.0597408302218319, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5959298610687256, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7120606303215027, + "num_tokens": 249779905.0, + "step": 9650 + }, + { + "epoch": 1.0598506479244454, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.348909854888916, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7180694341659546, + "num_tokens": 249806155.0, + "step": 9651 + }, + { + "epoch": 1.0599604656270591, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.188533306121826, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.7005516290664673, + "num_tokens": 249836082.0, + "step": 9652 + }, + { + "epoch": 1.0600702833296727, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4707846641540527, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7127282619476318, + "num_tokens": 249860640.0, + "step": 9653 + }, + { + "epoch": 1.0601801010322864, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.4959468841552734, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7040581703186035, + "num_tokens": 249884934.0, + "step": 9654 + }, + { + "epoch": 1.0602899187349, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.452786922454834, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7067391276359558, + "num_tokens": 249910064.0, + "step": 9655 + }, + { + "epoch": 1.0603997364375137, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.3933534622192383, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6953087449073792, + "num_tokens": 249937239.0, + "step": 9656 + }, + { + "epoch": 1.0605095541401275, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 
3.0001866817474365, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7360538244247437, + "num_tokens": 249955880.0, + "step": 9657 + }, + { + "epoch": 1.060619371842741, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.1813971996307373, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6920239329338074, + "num_tokens": 249988295.0, + "step": 9658 + }, + { + "epoch": 1.0607291895453548, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.502472400665283, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7174124717712402, + "num_tokens": 250014371.0, + "step": 9659 + }, + { + "epoch": 1.0608390072479683, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.5628275871276855, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7253282070159912, + "num_tokens": 250035108.0, + "step": 9660 + }, + { + "epoch": 1.060948824950582, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.585688829421997, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.716271698474884, + "num_tokens": 250057924.0, + "step": 9661 + }, + { + "epoch": 1.0610586426531956, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.218729257583618, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.706119179725647, + "num_tokens": 250088872.0, + "step": 9662 + }, + { + "epoch": 1.0611684603558094, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.689464807510376, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6882973909378052, + "num_tokens": 250110129.0, + "step": 9663 + }, + { + "epoch": 1.0612782780584231, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5551624298095703, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7088680267333984, + "num_tokens": 250132915.0, + "step": 9664 + }, + { + "epoch": 1.0613880957610367, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.409881353378296, + "learning_rate": 1e-06, 
+ "loss": 1.0214, + "mean_token_accuracy": 0.7044225931167603, + "num_tokens": 250158548.0, + "step": 9665 + }, + { + "epoch": 1.0614979134636504, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5252232551574707, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7041381597518921, + "num_tokens": 250183701.0, + "step": 9666 + }, + { + "epoch": 1.061607731166264, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.2134785652160645, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.707095742225647, + "num_tokens": 250212131.0, + "step": 9667 + }, + { + "epoch": 1.0617175488688777, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4981210231781006, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7278753519058228, + "num_tokens": 250235131.0, + "step": 9668 + }, + { + "epoch": 1.0618273665714912, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5323679447174072, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7110832929611206, + "num_tokens": 250258718.0, + "step": 9669 + }, + { + "epoch": 1.061937184274105, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3398971557617188, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7360029220581055, + "num_tokens": 250283341.0, + "step": 9670 + }, + { + "epoch": 1.0620470019767188, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5020172595977783, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7309316992759705, + "num_tokens": 250304962.0, + "step": 9671 + }, + { + "epoch": 1.0621568196793323, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.390455722808838, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6942431926727295, + "num_tokens": 250332787.0, + "step": 9672 + }, + { + "epoch": 1.062266637381946, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.0661721229553223, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 
0.6838713884353638, + "num_tokens": 250368623.0, + "step": 9673 + }, + { + "epoch": 1.0623764550845596, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.519815683364868, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7135770916938782, + "num_tokens": 250392124.0, + "step": 9674 + }, + { + "epoch": 1.0624862727871733, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3623907566070557, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7136988639831543, + "num_tokens": 250417204.0, + "step": 9675 + }, + { + "epoch": 1.0625960904897869, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3437111377716064, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6986164450645447, + "num_tokens": 250444684.0, + "step": 9676 + }, + { + "epoch": 1.0627059081924006, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.494764804840088, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.697600781917572, + "num_tokens": 250469887.0, + "step": 9677 + }, + { + "epoch": 1.0628157258950144, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.531548500061035, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7168838977813721, + "num_tokens": 250498534.0, + "step": 9678 + }, + { + "epoch": 1.062925543597628, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3473408222198486, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7340853214263916, + "num_tokens": 250525802.0, + "step": 9679 + }, + { + "epoch": 1.0630353613002417, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3117403984069824, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.717876672744751, + "num_tokens": 250553585.0, + "step": 9680 + }, + { + "epoch": 1.0631451790028552, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5703277587890625, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7255353927612305, + "num_tokens": 
250576137.0, + "step": 9681 + }, + { + "epoch": 1.063254996705469, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.186292886734009, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6869982481002808, + "num_tokens": 250606474.0, + "step": 9682 + }, + { + "epoch": 1.0633648144080825, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.1730926036834717, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7132173180580139, + "num_tokens": 250636724.0, + "step": 9683 + }, + { + "epoch": 1.0634746321106963, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.234508991241455, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7046608328819275, + "num_tokens": 250665159.0, + "step": 9684 + }, + { + "epoch": 1.0635844498133098, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.455443859100342, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7281734943389893, + "num_tokens": 250687852.0, + "step": 9685 + }, + { + "epoch": 1.0636942675159236, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.229849100112915, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7249712347984314, + "num_tokens": 250717529.0, + "step": 9686 + }, + { + "epoch": 1.0638040852185373, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.562035322189331, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7047637701034546, + "num_tokens": 250738915.0, + "step": 9687 + }, + { + "epoch": 1.0639139029211508, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5677547454833984, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7224996089935303, + "num_tokens": 250761103.0, + "step": 9688 + }, + { + "epoch": 1.0640237206237646, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4963536262512207, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7254231572151184, + "num_tokens": 250784246.0, + "step": 9689 + }, + { + 
"epoch": 1.0641335383263781, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4241161346435547, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7148905992507935, + "num_tokens": 250808922.0, + "step": 9690 + }, + { + "epoch": 1.064243356028992, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.21812105178833, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6929327249526978, + "num_tokens": 250837921.0, + "step": 9691 + }, + { + "epoch": 1.0643531737316057, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.381478786468506, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7150278091430664, + "num_tokens": 250864218.0, + "step": 9692 + }, + { + "epoch": 1.0644629914342192, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3225834369659424, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7260599136352539, + "num_tokens": 250889667.0, + "step": 9693 + }, + { + "epoch": 1.064572809136833, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 2.8419301509857178, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7421663999557495, + "num_tokens": 250906367.0, + "step": 9694 + }, + { + "epoch": 1.0646826268394465, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.281916618347168, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7130283713340759, + "num_tokens": 250935459.0, + "step": 9695 + }, + { + "epoch": 1.0647924445420602, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.4861559867858887, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7181745767593384, + "num_tokens": 250960205.0, + "step": 9696 + }, + { + "epoch": 1.0649022622446738, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3312461376190186, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7088508605957031, + "num_tokens": 250986395.0, + "step": 9697 + }, + { + "epoch": 1.0650120799472875, + "ewc_loss": 
1.609325408935547e-05, + "grad_norm": 2.5925986766815186, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.7033017873764038, + "num_tokens": 251010206.0, + "step": 9698 + }, + { + "epoch": 1.065121897649901, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.294434070587158, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.703927755355835, + "num_tokens": 251037764.0, + "step": 9699 + }, + { + "epoch": 1.0652317153525148, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3811593055725098, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7146813869476318, + "num_tokens": 251065632.0, + "step": 9700 + }, + { + "epoch": 1.0653415330551286, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.610299825668335, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7000730633735657, + "num_tokens": 251087567.0, + "step": 9701 + }, + { + "epoch": 1.065451350757742, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.839325428009033, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7329224944114685, + "num_tokens": 251105169.0, + "step": 9702 + }, + { + "epoch": 1.0655611684603559, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.315323829650879, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7535458207130432, + "num_tokens": 251129897.0, + "step": 9703 + }, + { + "epoch": 1.0656709861629694, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.661862373352051, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7217118740081787, + "num_tokens": 251152836.0, + "step": 9704 + }, + { + "epoch": 1.0657808038655832, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4586451053619385, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7109304666519165, + "num_tokens": 251178610.0, + "step": 9705 + }, + { + "epoch": 1.0658906215681967, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 
2.3290436267852783, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7075133919715881, + "num_tokens": 251203642.0, + "step": 9706 + }, + { + "epoch": 1.0660004392708105, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3840901851654053, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.7109657526016235, + "num_tokens": 251228970.0, + "step": 9707 + }, + { + "epoch": 1.0661102569734242, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2551121711730957, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7142979502677917, + "num_tokens": 251256481.0, + "step": 9708 + }, + { + "epoch": 1.0662200746760377, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.389434814453125, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6908918023109436, + "num_tokens": 251284419.0, + "step": 9709 + }, + { + "epoch": 1.0663298923786515, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.383805751800537, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7100206613540649, + "num_tokens": 251308932.0, + "step": 9710 + }, + { + "epoch": 1.066439710081265, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2034482955932617, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6976299285888672, + "num_tokens": 251339253.0, + "step": 9711 + }, + { + "epoch": 1.0665495277838788, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.772214651107788, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7198477983474731, + "num_tokens": 251360860.0, + "step": 9712 + }, + { + "epoch": 1.0666593454864923, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3807592391967773, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7199549674987793, + "num_tokens": 251385661.0, + "step": 9713 + }, + { + "epoch": 1.066769163189106, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5209157466888428, + "learning_rate": 1e-06, 
+ "loss": 0.9997, + "mean_token_accuracy": 0.7101026773452759, + "num_tokens": 251411174.0, + "step": 9714 + }, + { + "epoch": 1.0668789808917198, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.704962730407715, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7150317430496216, + "num_tokens": 251433780.0, + "step": 9715 + }, + { + "epoch": 1.0669887985943334, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.607879161834717, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7072927951812744, + "num_tokens": 251457958.0, + "step": 9716 + }, + { + "epoch": 1.0670986162969471, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.358299732208252, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7130944728851318, + "num_tokens": 251483937.0, + "step": 9717 + }, + { + "epoch": 1.0672084339995607, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.490478038787842, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7171857357025146, + "num_tokens": 251506303.0, + "step": 9718 + }, + { + "epoch": 1.0673182517021744, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5464508533477783, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7014033794403076, + "num_tokens": 251531605.0, + "step": 9719 + }, + { + "epoch": 1.067428069404788, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.356337070465088, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6847519874572754, + "num_tokens": 251559470.0, + "step": 9720 + }, + { + "epoch": 1.0675378871074017, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.6192233562469482, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.738362729549408, + "num_tokens": 251580254.0, + "step": 9721 + }, + { + "epoch": 1.0676477048100155, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.357022285461426, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 
0.707334041595459, + "num_tokens": 251606712.0, + "step": 9722 + }, + { + "epoch": 1.067757522512629, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.500178813934326, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6976284384727478, + "num_tokens": 251632873.0, + "step": 9723 + }, + { + "epoch": 1.0678673402152428, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3261070251464844, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6976228356361389, + "num_tokens": 251662345.0, + "step": 9724 + }, + { + "epoch": 1.0679771579178563, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.0867676734924316, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7145155072212219, + "num_tokens": 251694926.0, + "step": 9725 + }, + { + "epoch": 1.06808697562047, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5634195804595947, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7122399806976318, + "num_tokens": 251717795.0, + "step": 9726 + }, + { + "epoch": 1.0681967933230836, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.308675527572632, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7168018221855164, + "num_tokens": 251744985.0, + "step": 9727 + }, + { + "epoch": 1.0683066110256974, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4358696937561035, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7042398452758789, + "num_tokens": 251769990.0, + "step": 9728 + }, + { + "epoch": 1.068416428728311, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.8280656337738037, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.696213960647583, + "num_tokens": 251793256.0, + "step": 9729 + }, + { + "epoch": 1.0685262464309246, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3172433376312256, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6851849555969238, + "num_tokens": 251824616.0, 
+ "step": 9730 + }, + { + "epoch": 1.0686360641335384, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5631227493286133, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6957873106002808, + "num_tokens": 251849164.0, + "step": 9731 + }, + { + "epoch": 1.068745881836152, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.1541824340820312, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7257508635520935, + "num_tokens": 251879989.0, + "step": 9732 + }, + { + "epoch": 1.0688556995387657, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3413116931915283, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7088907957077026, + "num_tokens": 251906896.0, + "step": 9733 + }, + { + "epoch": 1.0689655172413792, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2707021236419678, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7023702263832092, + "num_tokens": 251935270.0, + "step": 9734 + }, + { + "epoch": 1.069075334943993, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.5589439868927, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7201973795890808, + "num_tokens": 251958585.0, + "step": 9735 + }, + { + "epoch": 1.0691851526466065, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.233930826187134, + "learning_rate": 1e-06, + "loss": 1.0918, + "mean_token_accuracy": 0.6778239011764526, + "num_tokens": 251989265.0, + "step": 9736 + }, + { + "epoch": 1.0692949703492203, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.2820522785186768, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7316809296607971, + "num_tokens": 252015010.0, + "step": 9737 + }, + { + "epoch": 1.069404788051834, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.508943796157837, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7206201553344727, + "num_tokens": 252038183.0, + "step": 9738 + }, + { + "epoch": 
1.0695146057544476, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.597754716873169, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.69759202003479, + "num_tokens": 252061193.0, + "step": 9739 + }, + { + "epoch": 1.0696244234570613, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.222492218017578, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7084778547286987, + "num_tokens": 252091708.0, + "step": 9740 + }, + { + "epoch": 1.0697342411596749, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.7080283164978027, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7315664291381836, + "num_tokens": 252110662.0, + "step": 9741 + }, + { + "epoch": 1.0698440588622886, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3122451305389404, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7136790752410889, + "num_tokens": 252138537.0, + "step": 9742 + }, + { + "epoch": 1.0699538765649024, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3588201999664307, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7092475891113281, + "num_tokens": 252164974.0, + "step": 9743 + }, + { + "epoch": 1.070063694267516, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.3685226440429688, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7148371934890747, + "num_tokens": 252189644.0, + "step": 9744 + }, + { + "epoch": 1.0701735119701297, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4139468669891357, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6898863315582275, + "num_tokens": 252215816.0, + "step": 9745 + }, + { + "epoch": 1.0702833296727432, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.2987616062164307, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7102398872375488, + "num_tokens": 252242546.0, + "step": 9746 + }, + { + "epoch": 1.070393147375357, + "ewc_loss": 
1.621246337890625e-05, + "grad_norm": 2.806539535522461, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7129305005073547, + "num_tokens": 252261287.0, + "step": 9747 + }, + { + "epoch": 1.0705029650779705, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.46441912651062, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7171428203582764, + "num_tokens": 252288336.0, + "step": 9748 + }, + { + "epoch": 1.0706127827805842, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2476515769958496, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7223792672157288, + "num_tokens": 252315179.0, + "step": 9749 + }, + { + "epoch": 1.0707226004831978, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3294310569763184, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.700916051864624, + "num_tokens": 252341554.0, + "step": 9750 + }, + { + "epoch": 1.0708324181858115, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.388831377029419, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6991044878959656, + "num_tokens": 252369884.0, + "step": 9751 + }, + { + "epoch": 1.0709422358884253, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3314919471740723, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.715125322341919, + "num_tokens": 252395945.0, + "step": 9752 + }, + { + "epoch": 1.0710520535910388, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5255675315856934, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7142266631126404, + "num_tokens": 252421213.0, + "step": 9753 + }, + { + "epoch": 1.0711618712936526, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.597999334335327, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7084623575210571, + "num_tokens": 252444980.0, + "step": 9754 + }, + { + "epoch": 1.0712716889962661, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 
2.465972900390625, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7052470445632935, + "num_tokens": 252472709.0, + "step": 9755 + }, + { + "epoch": 1.0713815066988799, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.121490478515625, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7108337879180908, + "num_tokens": 252502923.0, + "step": 9756 + }, + { + "epoch": 1.0714913244014934, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3381402492523193, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7023934721946716, + "num_tokens": 252530288.0, + "step": 9757 + }, + { + "epoch": 1.0716011421041072, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.270357847213745, + "learning_rate": 1e-06, + "loss": 1.0864, + "mean_token_accuracy": 0.689193844795227, + "num_tokens": 252561438.0, + "step": 9758 + }, + { + "epoch": 1.071710959806721, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2575509548187256, + "learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6864923238754272, + "num_tokens": 252591279.0, + "step": 9759 + }, + { + "epoch": 1.0718207775093345, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.1942484378814697, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7164046168327332, + "num_tokens": 252621008.0, + "step": 9760 + }, + { + "epoch": 1.0719305952119482, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3522613048553467, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7059389352798462, + "num_tokens": 252647364.0, + "step": 9761 + }, + { + "epoch": 1.0720404129145618, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5117571353912354, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7151928544044495, + "num_tokens": 252670834.0, + "step": 9762 + }, + { + "epoch": 1.0721502306171755, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.47607159614563, + "learning_rate": 1e-06, 
+ "loss": 1.0162, + "mean_token_accuracy": 0.697838544845581, + "num_tokens": 252695647.0, + "step": 9763 + }, + { + "epoch": 1.072260048319789, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2093505859375, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7119843363761902, + "num_tokens": 252725425.0, + "step": 9764 + }, + { + "epoch": 1.0723698660224028, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.2106637954711914, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6932186484336853, + "num_tokens": 252755765.0, + "step": 9765 + }, + { + "epoch": 1.0724796837250166, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.275195360183716, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.713242769241333, + "num_tokens": 252784490.0, + "step": 9766 + }, + { + "epoch": 1.07258950142763, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.2576849460601807, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7173525094985962, + "num_tokens": 252812843.0, + "step": 9767 + }, + { + "epoch": 1.0726993191302439, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.626328468322754, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7134190797805786, + "num_tokens": 252834022.0, + "step": 9768 + }, + { + "epoch": 1.0728091368328574, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.32027268409729, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.712469220161438, + "num_tokens": 252860473.0, + "step": 9769 + }, + { + "epoch": 1.0729189545354711, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.2032477855682373, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6856697201728821, + "num_tokens": 252892540.0, + "step": 9770 + }, + { + "epoch": 1.0730287722380847, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.1435084342956543, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 
0.71282958984375, + "num_tokens": 252925170.0, + "step": 9771 + }, + { + "epoch": 1.0731385899406984, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.7969698905944824, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7177728414535522, + "num_tokens": 252946526.0, + "step": 9772 + }, + { + "epoch": 1.0732484076433122, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.524238348007202, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6930440664291382, + "num_tokens": 252974024.0, + "step": 9773 + }, + { + "epoch": 1.0733582253459257, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 2.555851697921753, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7131295800209045, + "num_tokens": 252997552.0, + "step": 9774 + }, + { + "epoch": 1.0734680430485395, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3000171184539795, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7108657360076904, + "num_tokens": 253024018.0, + "step": 9775 + }, + { + "epoch": 1.073577860751153, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.023636817932129, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7378678321838379, + "num_tokens": 253059439.0, + "step": 9776 + }, + { + "epoch": 1.0736876784537668, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.1164629459381104, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7002338171005249, + "num_tokens": 253092060.0, + "step": 9777 + }, + { + "epoch": 1.0737974961563803, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.464043617248535, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7134072184562683, + "num_tokens": 253116732.0, + "step": 9778 + }, + { + "epoch": 1.073907313858994, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.7201662063598633, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7026587128639221, + "num_tokens": 253137444.0, 
+ "step": 9779 + }, + { + "epoch": 1.0740171315616078, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4727587699890137, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7041026949882507, + "num_tokens": 253163728.0, + "step": 9780 + }, + { + "epoch": 1.0741269492642214, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.6670022010803223, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.709634006023407, + "num_tokens": 253185007.0, + "step": 9781 + }, + { + "epoch": 1.0742367669668351, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4229485988616943, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7097799777984619, + "num_tokens": 253210039.0, + "step": 9782 + }, + { + "epoch": 1.0743465846694487, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.508903980255127, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7058920860290527, + "num_tokens": 253233730.0, + "step": 9783 + }, + { + "epoch": 1.0744564023720624, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.6511433124542236, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7385128140449524, + "num_tokens": 253253245.0, + "step": 9784 + }, + { + "epoch": 1.074566220074676, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 3.465250015258789, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7342251539230347, + "num_tokens": 253276720.0, + "step": 9785 + }, + { + "epoch": 1.0746760377772897, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.294349193572998, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.7050817012786865, + "num_tokens": 253304939.0, + "step": 9786 + }, + { + "epoch": 1.0747858554799035, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2123122215270996, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6946470141410828, + "num_tokens": 253337224.0, + "step": 9787 + }, + { + "epoch": 
1.074895673182517, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5586655139923096, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7106835842132568, + "num_tokens": 253361449.0, + "step": 9788 + }, + { + "epoch": 1.0750054908851308, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.610456705093384, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7247229814529419, + "num_tokens": 253382157.0, + "step": 9789 + }, + { + "epoch": 1.0751153085877443, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.303868293762207, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7008576393127441, + "num_tokens": 253416618.0, + "step": 9790 + }, + { + "epoch": 1.075225126290358, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.634392261505127, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7230249047279358, + "num_tokens": 253437656.0, + "step": 9791 + }, + { + "epoch": 1.0753349439929716, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.272235155105591, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7013165950775146, + "num_tokens": 253465676.0, + "step": 9792 + }, + { + "epoch": 1.0754447616955853, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5540847778320312, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.709781289100647, + "num_tokens": 253488247.0, + "step": 9793 + }, + { + "epoch": 1.075554579398199, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.7540345191955566, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7069893479347229, + "num_tokens": 253509582.0, + "step": 9794 + }, + { + "epoch": 1.0756643971008126, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5925381183624268, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7023352384567261, + "num_tokens": 253533499.0, + "step": 9795 + }, + { + "epoch": 1.0757742148034264, + "ewc_loss": 
1.6927719116210938e-05, + "grad_norm": 32.3342170715332, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7132243514060974, + "num_tokens": 253559942.0, + "step": 9796 + }, + { + "epoch": 1.07588403250604, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4905343055725098, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7167096138000488, + "num_tokens": 253587048.0, + "step": 9797 + }, + { + "epoch": 1.0759938502086537, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.6293699741363525, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.705022394657135, + "num_tokens": 253614210.0, + "step": 9798 + }, + { + "epoch": 1.0761036679112672, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2200686931610107, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7096173763275146, + "num_tokens": 253645388.0, + "step": 9799 + }, + { + "epoch": 1.076213485613881, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.68635630607605, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7228178977966309, + "num_tokens": 253666976.0, + "step": 9800 + }, + { + "epoch": 1.0763233033164945, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.232876777648926, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6893219947814941, + "num_tokens": 253695884.0, + "step": 9801 + }, + { + "epoch": 1.0764331210191083, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.0798184871673584, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6913551092147827, + "num_tokens": 253730963.0, + "step": 9802 + }, + { + "epoch": 1.076542938721722, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5387868881225586, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7163059711456299, + "num_tokens": 253754933.0, + "step": 9803 + }, + { + "epoch": 1.0766527564243356, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 
2.596191883087158, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6987517476081848, + "num_tokens": 253782586.0, + "step": 9804 + }, + { + "epoch": 1.0767625741269493, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.1132652759552, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7187892198562622, + "num_tokens": 253816135.0, + "step": 9805 + }, + { + "epoch": 1.0768723918295628, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.35573673248291, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7303650975227356, + "num_tokens": 253840466.0, + "step": 9806 + }, + { + "epoch": 1.0769822095321766, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.350395679473877, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7158808708190918, + "num_tokens": 253866512.0, + "step": 9807 + }, + { + "epoch": 1.0770920272347904, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.446812152862549, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7313908934593201, + "num_tokens": 253890883.0, + "step": 9808 + }, + { + "epoch": 1.077201844937404, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.682063579559326, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7174157500267029, + "num_tokens": 253913297.0, + "step": 9809 + }, + { + "epoch": 1.0773116626400177, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3100368976593018, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.7046021223068237, + "num_tokens": 253941360.0, + "step": 9810 + }, + { + "epoch": 1.0774214803426312, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.1726417541503906, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6857995986938477, + "num_tokens": 253971636.0, + "step": 9811 + }, + { + "epoch": 1.077531298045245, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.6641929149627686, + "learning_rate": 1e-06, + 
"loss": 0.9622, + "mean_token_accuracy": 0.7106913924217224, + "num_tokens": 253996545.0, + "step": 9812 + }, + { + "epoch": 1.0776411157478585, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.134469747543335, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6935374736785889, + "num_tokens": 254030517.0, + "step": 9813 + }, + { + "epoch": 1.0777509334504722, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.643336534500122, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7058124542236328, + "num_tokens": 254052332.0, + "step": 9814 + }, + { + "epoch": 1.0778607511530858, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 7.133281707763672, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7119020819664001, + "num_tokens": 254075027.0, + "step": 9815 + }, + { + "epoch": 1.0779705688556995, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.397334575653076, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6971132755279541, + "num_tokens": 254105596.0, + "step": 9816 + }, + { + "epoch": 1.0780803865583133, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5029618740081787, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.709267258644104, + "num_tokens": 254129912.0, + "step": 9817 + }, + { + "epoch": 1.0781902042609268, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.450789213180542, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7305924296379089, + "num_tokens": 254152853.0, + "step": 9818 + }, + { + "epoch": 1.0783000219635406, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3966152667999268, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7096452713012695, + "num_tokens": 254179367.0, + "step": 9819 + }, + { + "epoch": 1.0784098396661541, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3826844692230225, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 
0.6938969492912292, + "num_tokens": 254205615.0, + "step": 9820 + }, + { + "epoch": 1.0785196573687679, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.376781702041626, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7224403619766235, + "num_tokens": 254232185.0, + "step": 9821 + }, + { + "epoch": 1.0786294750713814, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.391558885574341, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.698663592338562, + "num_tokens": 254258918.0, + "step": 9822 + }, + { + "epoch": 1.0787392927739952, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.603055000305176, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7253340482711792, + "num_tokens": 254281886.0, + "step": 9823 + }, + { + "epoch": 1.078849110476609, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2692577838897705, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.698601245880127, + "num_tokens": 254312053.0, + "step": 9824 + }, + { + "epoch": 1.0789589281792225, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4362361431121826, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.722693920135498, + "num_tokens": 254335024.0, + "step": 9825 + }, + { + "epoch": 1.0790687458818362, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2893757820129395, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7053983211517334, + "num_tokens": 254361962.0, + "step": 9826 + }, + { + "epoch": 1.0791785635844497, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.0493412017822266, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7061673402786255, + "num_tokens": 254396842.0, + "step": 9827 + }, + { + "epoch": 1.0792883812870635, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4200007915496826, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7067950367927551, + "num_tokens": 
254422731.0, + "step": 9828 + }, + { + "epoch": 1.079398198989677, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.33292818069458, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7245087623596191, + "num_tokens": 254448617.0, + "step": 9829 + }, + { + "epoch": 1.0795080166922908, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.737541437149048, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6991716623306274, + "num_tokens": 254468811.0, + "step": 9830 + }, + { + "epoch": 1.0796178343949046, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4809279441833496, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7052441239356995, + "num_tokens": 254492675.0, + "step": 9831 + }, + { + "epoch": 1.079727652097518, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.736009120941162, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7165689468383789, + "num_tokens": 254512410.0, + "step": 9832 + }, + { + "epoch": 1.0798374698001318, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.538569211959839, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7101176977157593, + "num_tokens": 254535716.0, + "step": 9833 + }, + { + "epoch": 1.0799472875027454, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.6275148391723633, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7232497334480286, + "num_tokens": 254557037.0, + "step": 9834 + }, + { + "epoch": 1.0800571052053591, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.405806303024292, + "learning_rate": 1e-06, + "loss": 1.07, + "mean_token_accuracy": 0.6855117678642273, + "num_tokens": 254584259.0, + "step": 9835 + }, + { + "epoch": 1.0801669229079727, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4111711978912354, + "learning_rate": 1e-06, + "loss": 1.0945, + "mean_token_accuracy": 0.6736021637916565, + "num_tokens": 254614845.0, + "step": 9836 + }, + { + "epoch": 
1.0802767406105864, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.34783935546875, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7197489142417908, + "num_tokens": 254640490.0, + "step": 9837 + }, + { + "epoch": 1.0803865583132002, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2815802097320557, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.713161051273346, + "num_tokens": 254667390.0, + "step": 9838 + }, + { + "epoch": 1.0804963760158137, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.586270332336426, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6979799866676331, + "num_tokens": 254690296.0, + "step": 9839 + }, + { + "epoch": 1.0806061937184275, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.6295201778411865, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6962931156158447, + "num_tokens": 254715809.0, + "step": 9840 + }, + { + "epoch": 1.080716011421041, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3687641620635986, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7431236505508423, + "num_tokens": 254741448.0, + "step": 9841 + }, + { + "epoch": 1.0808258291236548, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3535702228546143, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6986685991287231, + "num_tokens": 254767857.0, + "step": 9842 + }, + { + "epoch": 1.0809356468262683, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4594454765319824, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.71529620885849, + "num_tokens": 254792749.0, + "step": 9843 + }, + { + "epoch": 1.081045464528882, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.332166910171509, + "learning_rate": 1e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.6876412034034729, + "num_tokens": 254827509.0, + "step": 9844 + }, + { + "epoch": 1.0811552822314958, + "ewc_loss": 
1.621246337890625e-05, + "grad_norm": 2.413005828857422, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7392487525939941, + "num_tokens": 254849998.0, + "step": 9845 + }, + { + "epoch": 1.0812650999341094, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5702126026153564, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6991634368896484, + "num_tokens": 254873692.0, + "step": 9846 + }, + { + "epoch": 1.081374917636723, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2990100383758545, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7207873463630676, + "num_tokens": 254901458.0, + "step": 9847 + }, + { + "epoch": 1.0814847353393366, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.672226905822754, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.7107127904891968, + "num_tokens": 254922737.0, + "step": 9848 + }, + { + "epoch": 1.0815945530419504, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.360891580581665, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.7008206844329834, + "num_tokens": 254951216.0, + "step": 9849 + }, + { + "epoch": 1.081704370744564, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.549889326095581, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7064425945281982, + "num_tokens": 254975725.0, + "step": 9850 + }, + { + "epoch": 1.0818141884471777, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.0491881370544434, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6933114528656006, + "num_tokens": 255012158.0, + "step": 9851 + }, + { + "epoch": 1.0819240061497912, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.9285240173339844, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7311452627182007, + "num_tokens": 255028102.0, + "step": 9852 + }, + { + "epoch": 1.082033823852405, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 
2.710904359817505, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.721746563911438, + "num_tokens": 255053098.0, + "step": 9853 + }, + { + "epoch": 1.0821436415550187, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.290968179702759, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7429473400115967, + "num_tokens": 255079204.0, + "step": 9854 + }, + { + "epoch": 1.0822534592576323, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.165895462036133, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7173826694488525, + "num_tokens": 255109320.0, + "step": 9855 + }, + { + "epoch": 1.082363276960246, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.147930860519409, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.689662754535675, + "num_tokens": 255142634.0, + "step": 9856 + }, + { + "epoch": 1.0824730946628596, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.49957275390625, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7367714643478394, + "num_tokens": 255166313.0, + "step": 9857 + }, + { + "epoch": 1.0825829123654733, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.385246753692627, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7080717086791992, + "num_tokens": 255194625.0, + "step": 9858 + }, + { + "epoch": 1.082692730068087, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3184585571289062, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7255937457084656, + "num_tokens": 255220850.0, + "step": 9859 + }, + { + "epoch": 1.0828025477707006, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.7036192417144775, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7169300317764282, + "num_tokens": 255243028.0, + "step": 9860 + }, + { + "epoch": 1.0829123654733144, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4292545318603516, + "learning_rate": 1e-06, + 
"loss": 1.0674, + "mean_token_accuracy": 0.6875700950622559, + "num_tokens": 255272468.0, + "step": 9861 + }, + { + "epoch": 1.083022183175928, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.611898899078369, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.712997317314148, + "num_tokens": 255296927.0, + "step": 9862 + }, + { + "epoch": 1.0831320008785417, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.240640640258789, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7356146574020386, + "num_tokens": 255323354.0, + "step": 9863 + }, + { + "epoch": 1.0832418185811552, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.133273124694824, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6908099055290222, + "num_tokens": 255356309.0, + "step": 9864 + }, + { + "epoch": 1.083351636283769, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.256037950515747, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7071373462677002, + "num_tokens": 255383162.0, + "step": 9865 + }, + { + "epoch": 1.0834614539863825, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.497662305831909, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6956347227096558, + "num_tokens": 255408460.0, + "step": 9866 + }, + { + "epoch": 1.0835712716889963, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.459094285964966, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7110612392425537, + "num_tokens": 255434169.0, + "step": 9867 + }, + { + "epoch": 1.08368108939161, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.6973774433135986, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6997759342193604, + "num_tokens": 255454970.0, + "step": 9868 + }, + { + "epoch": 1.0837909070942235, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.5746002197265625, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 
0.732272207736969, + "num_tokens": 255475839.0, + "step": 9869 + }, + { + "epoch": 1.0839007247968373, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2858376502990723, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6900650262832642, + "num_tokens": 255505850.0, + "step": 9870 + }, + { + "epoch": 1.0840105424994508, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.26096510887146, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6914612054824829, + "num_tokens": 255536149.0, + "step": 9871 + }, + { + "epoch": 1.0841203602020646, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.545093059539795, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7151588201522827, + "num_tokens": 255561334.0, + "step": 9872 + }, + { + "epoch": 1.0842301779046783, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.288896322250366, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.705711841583252, + "num_tokens": 255589566.0, + "step": 9873 + }, + { + "epoch": 1.0843399956072919, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.4763967990875244, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7323426604270935, + "num_tokens": 255613867.0, + "step": 9874 + }, + { + "epoch": 1.0844498133099056, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.715531587600708, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7182205319404602, + "num_tokens": 255634796.0, + "step": 9875 + }, + { + "epoch": 1.0845596310125192, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.171660900115967, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6945804953575134, + "num_tokens": 255666912.0, + "step": 9876 + }, + { + "epoch": 1.084669448715133, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.2866687774658203, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7029052376747131, + "num_tokens": 255693497.0, 
+ "step": 9877 + }, + { + "epoch": 1.0847792664177465, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 2.3229897022247314, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6992315053939819, + "num_tokens": 255719485.0, + "step": 9878 + }, + { + "epoch": 1.0848890841203602, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.397284746170044, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7106032967567444, + "num_tokens": 255743656.0, + "step": 9879 + }, + { + "epoch": 1.0849989018229738, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.169041872024536, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7026816606521606, + "num_tokens": 255774053.0, + "step": 9880 + }, + { + "epoch": 1.0851087195255875, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.807882070541382, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6983275413513184, + "num_tokens": 255794035.0, + "step": 9881 + }, + { + "epoch": 1.0852185372282013, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4633095264434814, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7423112392425537, + "num_tokens": 255819485.0, + "step": 9882 + }, + { + "epoch": 1.0853283549308148, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.480731725692749, + "learning_rate": 1e-06, + "loss": 1.0875, + "mean_token_accuracy": 0.6839224100112915, + "num_tokens": 255844570.0, + "step": 9883 + }, + { + "epoch": 1.0854381726334286, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.346050262451172, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6876742243766785, + "num_tokens": 255873762.0, + "step": 9884 + }, + { + "epoch": 1.085547990336042, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5835888385772705, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7116490602493286, + "num_tokens": 255896096.0, + "step": 9885 + }, + { + "epoch": 
1.0856578080386559, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.1682674884796143, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6793568134307861, + "num_tokens": 255927815.0, + "step": 9886 + }, + { + "epoch": 1.0857676257412694, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.481977939605713, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.723939061164856, + "num_tokens": 255949988.0, + "step": 9887 + }, + { + "epoch": 1.0858774434438832, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.702119827270508, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.741369903087616, + "num_tokens": 255968301.0, + "step": 9888 + }, + { + "epoch": 1.085987261146497, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.521171808242798, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7134569883346558, + "num_tokens": 255991431.0, + "step": 9889 + }, + { + "epoch": 1.0860970788491104, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2159547805786133, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6878453493118286, + "num_tokens": 256023317.0, + "step": 9890 + }, + { + "epoch": 1.0862068965517242, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.9763376712799072, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7087287902832031, + "num_tokens": 256042239.0, + "step": 9891 + }, + { + "epoch": 1.0863167142543377, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.42417049407959, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7328478097915649, + "num_tokens": 256069235.0, + "step": 9892 + }, + { + "epoch": 1.0864265319569515, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4352896213531494, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7150514721870422, + "num_tokens": 256094727.0, + "step": 9893 + }, + { + "epoch": 1.086536349659565, + "ewc_loss": 
1.6450881958007812e-05, + "grad_norm": 2.6409430503845215, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7252074480056763, + "num_tokens": 256116410.0, + "step": 9894 + }, + { + "epoch": 1.0866461673621788, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.485111951828003, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7240515947341919, + "num_tokens": 256140523.0, + "step": 9895 + }, + { + "epoch": 1.0867559850647925, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.140343427658081, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.724418044090271, + "num_tokens": 256173084.0, + "step": 9896 + }, + { + "epoch": 1.086865802767406, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4935286045074463, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6997482776641846, + "num_tokens": 256197211.0, + "step": 9897 + }, + { + "epoch": 1.0869756204700198, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5239174365997314, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7195104360580444, + "num_tokens": 256219446.0, + "step": 9898 + }, + { + "epoch": 1.0870854381726334, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.720830202102661, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7345967292785645, + "num_tokens": 256240081.0, + "step": 9899 + }, + { + "epoch": 1.0871952558752471, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2236132621765137, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7224860191345215, + "num_tokens": 256268252.0, + "step": 9900 + }, + { + "epoch": 1.0873050735778607, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.732424736022949, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7400696277618408, + "num_tokens": 256288236.0, + "step": 9901 + }, + { + "epoch": 1.0874148912804744, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 
2.771498680114746, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7256430387496948, + "num_tokens": 256307306.0, + "step": 9902 + }, + { + "epoch": 1.0875247089830882, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2463600635528564, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6854801774024963, + "num_tokens": 256336305.0, + "step": 9903 + }, + { + "epoch": 1.0876345266857017, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.43209171295166, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7287890911102295, + "num_tokens": 256360289.0, + "step": 9904 + }, + { + "epoch": 1.0877443443883155, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.1911842823028564, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7163529396057129, + "num_tokens": 256390078.0, + "step": 9905 + }, + { + "epoch": 1.087854162090929, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.30405592918396, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7030107975006104, + "num_tokens": 256420136.0, + "step": 9906 + }, + { + "epoch": 1.0879639797935428, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.266906499862671, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7138209342956543, + "num_tokens": 256448256.0, + "step": 9907 + }, + { + "epoch": 1.0880737974961563, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.232578754425049, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.699006974697113, + "num_tokens": 256477398.0, + "step": 9908 + }, + { + "epoch": 1.08818361519877, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5602567195892334, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7283505797386169, + "num_tokens": 256499772.0, + "step": 9909 + }, + { + "epoch": 1.0882934329013838, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3003270626068115, + "learning_rate": 
1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7079540491104126, + "num_tokens": 256528746.0, + "step": 9910 + }, + { + "epoch": 1.0884032506039973, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.6771373748779297, + "learning_rate": 1e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.676741361618042, + "num_tokens": 256553700.0, + "step": 9911 + }, + { + "epoch": 1.088513068306611, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.338639736175537, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7167149782180786, + "num_tokens": 256581267.0, + "step": 9912 + }, + { + "epoch": 1.0886228860092246, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3480913639068604, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6944441795349121, + "num_tokens": 256606458.0, + "step": 9913 + }, + { + "epoch": 1.0887327037118384, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4812614917755127, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.716852068901062, + "num_tokens": 256631026.0, + "step": 9914 + }, + { + "epoch": 1.088842521414452, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.1972453594207764, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.7010512948036194, + "num_tokens": 256659438.0, + "step": 9915 + }, + { + "epoch": 1.0889523391170657, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3870689868927, + "learning_rate": 1e-06, + "loss": 1.0796, + "mean_token_accuracy": 0.6783392429351807, + "num_tokens": 256690266.0, + "step": 9916 + }, + { + "epoch": 1.0890621568196792, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.214050054550171, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7242047190666199, + "num_tokens": 256719995.0, + "step": 9917 + }, + { + "epoch": 1.089171974522293, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.616774082183838, + "learning_rate": 1e-06, + "loss": 0.9763, + 
"mean_token_accuracy": 0.704323410987854, + "num_tokens": 256741027.0, + "step": 9918 + }, + { + "epoch": 1.0892817922249067, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.536330223083496, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6817398071289062, + "num_tokens": 256764856.0, + "step": 9919 + }, + { + "epoch": 1.0893916099275203, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3064329624176025, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6965072751045227, + "num_tokens": 256791611.0, + "step": 9920 + }, + { + "epoch": 1.089501427630134, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.452366352081299, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7068158984184265, + "num_tokens": 256816799.0, + "step": 9921 + }, + { + "epoch": 1.0896112453327476, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.254689931869507, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.688189685344696, + "num_tokens": 256844587.0, + "step": 9922 + }, + { + "epoch": 1.0897210630353613, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.47749924659729, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7100781798362732, + "num_tokens": 256869769.0, + "step": 9923 + }, + { + "epoch": 1.089830880737975, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.233815908432007, + "learning_rate": 1e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6869451403617859, + "num_tokens": 256902427.0, + "step": 9924 + }, + { + "epoch": 1.0899406984405886, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.389948844909668, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7068246603012085, + "num_tokens": 256928302.0, + "step": 9925 + }, + { + "epoch": 1.0900505161432024, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.697530746459961, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7001156210899353, + 
"num_tokens": 256949129.0, + "step": 9926 + }, + { + "epoch": 1.090160333845816, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2790911197662354, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7085431814193726, + "num_tokens": 256979446.0, + "step": 9927 + }, + { + "epoch": 1.0902701515484297, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.414233684539795, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7116982936859131, + "num_tokens": 257006538.0, + "step": 9928 + }, + { + "epoch": 1.0903799692510432, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3944177627563477, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7275618314743042, + "num_tokens": 257030709.0, + "step": 9929 + }, + { + "epoch": 1.090489786953657, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.444850444793701, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7208032011985779, + "num_tokens": 257054215.0, + "step": 9930 + }, + { + "epoch": 1.0905996046562705, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.285879611968994, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7123092412948608, + "num_tokens": 257080815.0, + "step": 9931 + }, + { + "epoch": 1.0907094223588842, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2973711490631104, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7197142839431763, + "num_tokens": 257105877.0, + "step": 9932 + }, + { + "epoch": 1.090819240061498, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5794034004211426, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7162981033325195, + "num_tokens": 257128601.0, + "step": 9933 + }, + { + "epoch": 1.0909290577641115, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.660538911819458, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7231284379959106, + "num_tokens": 257149597.0, + "step": 
9934 + }, + { + "epoch": 1.0910388754667253, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.6048831939697266, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7023109197616577, + "num_tokens": 257171312.0, + "step": 9935 + }, + { + "epoch": 1.0911486931693388, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.1932449340820312, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6997467279434204, + "num_tokens": 257204631.0, + "step": 9936 + }, + { + "epoch": 1.0912585108719526, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.503605604171753, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.697517454624176, + "num_tokens": 257228960.0, + "step": 9937 + }, + { + "epoch": 1.0913683285745663, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.534456253051758, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7202211022377014, + "num_tokens": 257251455.0, + "step": 9938 + }, + { + "epoch": 1.0914781462771799, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4318583011627197, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7101667523384094, + "num_tokens": 257277132.0, + "step": 9939 + }, + { + "epoch": 1.0915879639797936, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.461385488510132, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.7065396308898926, + "num_tokens": 257301807.0, + "step": 9940 + }, + { + "epoch": 1.0916977816824072, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.1296095848083496, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7029109001159668, + "num_tokens": 257333641.0, + "step": 9941 + }, + { + "epoch": 1.091807599385021, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.293031930923462, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7212383151054382, + "num_tokens": 257361424.0, + "step": 9942 + }, + { + "epoch": 
1.0919174170876345, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.6158061027526855, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7063483595848083, + "num_tokens": 257383212.0, + "step": 9943 + }, + { + "epoch": 1.0920272347902482, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.283925771713257, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7036805152893066, + "num_tokens": 257412798.0, + "step": 9944 + }, + { + "epoch": 1.0921370524928617, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.351594924926758, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7044536471366882, + "num_tokens": 257440412.0, + "step": 9945 + }, + { + "epoch": 1.0922468701954755, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.258805990219116, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7136305570602417, + "num_tokens": 257468699.0, + "step": 9946 + }, + { + "epoch": 1.0923566878980893, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.213512420654297, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6985558271408081, + "num_tokens": 257497606.0, + "step": 9947 + }, + { + "epoch": 1.0924665056007028, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5183773040771484, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7243792414665222, + "num_tokens": 257520184.0, + "step": 9948 + }, + { + "epoch": 1.0925763233033166, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.275588035583496, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6891681551933289, + "num_tokens": 257550058.0, + "step": 9949 + }, + { + "epoch": 1.09268614100593, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3446743488311768, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7040049433708191, + "num_tokens": 257577836.0, + "step": 9950 + }, + { + "epoch": 1.0927959587085438, + "ewc_loss": 
1.6450881958007812e-05, + "grad_norm": 2.0103917121887207, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6985111832618713, + "num_tokens": 257614681.0, + "step": 9951 + }, + { + "epoch": 1.0929057764111574, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.475255012512207, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7103408575057983, + "num_tokens": 257638113.0, + "step": 9952 + }, + { + "epoch": 1.0930155941137711, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.412837028503418, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.723862886428833, + "num_tokens": 257661459.0, + "step": 9953 + }, + { + "epoch": 1.093125411816385, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.1207022666931152, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6917358636856079, + "num_tokens": 257694517.0, + "step": 9954 + }, + { + "epoch": 1.0932352295189984, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2309141159057617, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7219878435134888, + "num_tokens": 257721198.0, + "step": 9955 + }, + { + "epoch": 1.0933450472216122, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5451223850250244, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7342028617858887, + "num_tokens": 257741074.0, + "step": 9956 + }, + { + "epoch": 1.0934548649242257, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.433441638946533, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6999094486236572, + "num_tokens": 257767076.0, + "step": 9957 + }, + { + "epoch": 1.0935646826268395, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.171342611312866, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6860923171043396, + "num_tokens": 257798867.0, + "step": 9958 + }, + { + "epoch": 1.093674500329453, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 
2.7154250144958496, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7211670875549316, + "num_tokens": 257821000.0, + "step": 9959 + }, + { + "epoch": 1.0937843180320668, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3799309730529785, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7313520908355713, + "num_tokens": 257845751.0, + "step": 9960 + }, + { + "epoch": 1.0938941357346805, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 3.840576648712158, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7139295339584351, + "num_tokens": 257870734.0, + "step": 9961 + }, + { + "epoch": 1.094003953437294, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.627992630004883, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7045348882675171, + "num_tokens": 257892104.0, + "step": 9962 + }, + { + "epoch": 1.0941137711399078, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4512689113616943, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.697978675365448, + "num_tokens": 257918169.0, + "step": 9963 + }, + { + "epoch": 1.0942235888425214, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.827411651611328, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7180227041244507, + "num_tokens": 257939361.0, + "step": 9964 + }, + { + "epoch": 1.0943334065451351, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.518315076828003, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7114462852478027, + "num_tokens": 257962990.0, + "step": 9965 + }, + { + "epoch": 1.0944432242477486, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.947350025177002, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7424754500389099, + "num_tokens": 257980096.0, + "step": 9966 + }, + { + "epoch": 1.0945530419503624, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.243685245513916, + "learning_rate": 
1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7302087545394897, + "num_tokens": 258007207.0, + "step": 9967 + }, + { + "epoch": 1.0946628596529762, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.0243661403656006, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6938457489013672, + "num_tokens": 258043075.0, + "step": 9968 + }, + { + "epoch": 1.0947726773555897, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.1133291721343994, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7322466969490051, + "num_tokens": 258073828.0, + "step": 9969 + }, + { + "epoch": 1.0948824950582035, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.665783405303955, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6968331336975098, + "num_tokens": 258100626.0, + "step": 9970 + }, + { + "epoch": 1.094992312760817, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3828132152557373, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7461388111114502, + "num_tokens": 258125951.0, + "step": 9971 + }, + { + "epoch": 1.0951021304634307, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.245591402053833, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7286742925643921, + "num_tokens": 258151984.0, + "step": 9972 + }, + { + "epoch": 1.0952119481660443, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.414919376373291, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6929596066474915, + "num_tokens": 258177525.0, + "step": 9973 + }, + { + "epoch": 1.095321765868658, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 3.91835880279541, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7054033279418945, + "num_tokens": 258203254.0, + "step": 9974 + }, + { + "epoch": 1.0954315835712718, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.413957357406616, + "learning_rate": 1e-06, + "loss": 1.0058, + 
"mean_token_accuracy": 0.7018802762031555, + "num_tokens": 258230192.0, + "step": 9975 + }, + { + "epoch": 1.0955414012738853, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5160133838653564, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6946873664855957, + "num_tokens": 258253607.0, + "step": 9976 + }, + { + "epoch": 1.095651218976499, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5515716075897217, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7133674621582031, + "num_tokens": 258276578.0, + "step": 9977 + }, + { + "epoch": 1.0957610366791126, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3954615592956543, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7075914144515991, + "num_tokens": 258302613.0, + "step": 9978 + }, + { + "epoch": 1.0958708543817264, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.019477128982544, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.69979327917099, + "num_tokens": 258337946.0, + "step": 9979 + }, + { + "epoch": 1.09598067208434, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.18133807182312, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6955476999282837, + "num_tokens": 258369426.0, + "step": 9980 + }, + { + "epoch": 1.0960904897869537, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.6519052982330322, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7155904173851013, + "num_tokens": 258391493.0, + "step": 9981 + }, + { + "epoch": 1.0962003074895672, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.30203914642334, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7080150842666626, + "num_tokens": 258417199.0, + "step": 9982 + }, + { + "epoch": 1.096310125192181, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5946719646453857, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7045100331306458, 
+ "num_tokens": 258438100.0, + "step": 9983 + }, + { + "epoch": 1.0964199428947947, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3135740756988525, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7342809438705444, + "num_tokens": 258463474.0, + "step": 9984 + }, + { + "epoch": 1.0965297605974083, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4699199199676514, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6967282891273499, + "num_tokens": 258488155.0, + "step": 9985 + }, + { + "epoch": 1.096639578300022, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4705891609191895, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7152631282806396, + "num_tokens": 258511403.0, + "step": 9986 + }, + { + "epoch": 1.0967493960026355, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.6739280223846436, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6959960460662842, + "num_tokens": 258533843.0, + "step": 9987 + }, + { + "epoch": 1.0968592137052493, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4627010822296143, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7145352363586426, + "num_tokens": 258559380.0, + "step": 9988 + }, + { + "epoch": 1.096969031407863, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4136548042297363, + "learning_rate": 1e-06, + "loss": 1.1222, + "mean_token_accuracy": 0.6718211770057678, + "num_tokens": 258589868.0, + "step": 9989 + }, + { + "epoch": 1.0970788491104766, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.612199306488037, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7302001714706421, + "num_tokens": 258611281.0, + "step": 9990 + }, + { + "epoch": 1.0971886668130904, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.301264524459839, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6914600729942322, + "num_tokens": 258639167.0, + 
"step": 9991 + }, + { + "epoch": 1.0972984845157039, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.330554246902466, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.711071252822876, + "num_tokens": 258665202.0, + "step": 9992 + }, + { + "epoch": 1.0974083022183176, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.6946818828582764, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7240809202194214, + "num_tokens": 258686204.0, + "step": 9993 + }, + { + "epoch": 1.0975181199209312, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.294520616531372, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7222788333892822, + "num_tokens": 258712260.0, + "step": 9994 + }, + { + "epoch": 1.097627937623545, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.8032596111297607, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7171799540519714, + "num_tokens": 258732364.0, + "step": 9995 + }, + { + "epoch": 1.0977377553261585, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.444465160369873, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7401240468025208, + "num_tokens": 258758137.0, + "step": 9996 + }, + { + "epoch": 1.0978475730287722, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.506615400314331, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7076737880706787, + "num_tokens": 258781894.0, + "step": 9997 + }, + { + "epoch": 1.097957390731386, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 1.9469081163406372, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7202358841896057, + "num_tokens": 258816433.0, + "step": 9998 + }, + { + "epoch": 1.0980672084339995, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.476078987121582, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6960783004760742, + "num_tokens": 258842293.0, + "step": 9999 + }, + { + "epoch": 
1.0981770261366133, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5216445922851562, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7045676708221436, + "num_tokens": 258868498.0, + "step": 10000 + }, + { + "epoch": 1.0982868438392268, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5201215744018555, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7071447968482971, + "num_tokens": 258894056.0, + "step": 10001 + }, + { + "epoch": 1.0983966615418406, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.7720954418182373, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7370637059211731, + "num_tokens": 258913204.0, + "step": 10002 + }, + { + "epoch": 1.098506479244454, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.634920358657837, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7432146072387695, + "num_tokens": 258934560.0, + "step": 10003 + }, + { + "epoch": 1.0986162969470679, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5925023555755615, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7286301851272583, + "num_tokens": 258957202.0, + "step": 10004 + }, + { + "epoch": 1.0987261146496816, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.6653518676757812, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.716594934463501, + "num_tokens": 258978611.0, + "step": 10005 + }, + { + "epoch": 1.0988359323522952, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.404003143310547, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7087137699127197, + "num_tokens": 259004896.0, + "step": 10006 + }, + { + "epoch": 1.098945750054909, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.7025251388549805, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7057234048843384, + "num_tokens": 259029899.0, + "step": 10007 + }, + { + "epoch": 1.0990555677575224, + 
"ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.281067132949829, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7193917036056519, + "num_tokens": 259058605.0, + "step": 10008 + }, + { + "epoch": 1.0991653854601362, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4924895763397217, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7097408175468445, + "num_tokens": 259082524.0, + "step": 10009 + }, + { + "epoch": 1.0992752031627497, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.453244924545288, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7155623435974121, + "num_tokens": 259107190.0, + "step": 10010 + }, + { + "epoch": 1.0993850208653635, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.106966257095337, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7105894088745117, + "num_tokens": 259140725.0, + "step": 10011 + }, + { + "epoch": 1.0994948385679773, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.393342971801758, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6925585269927979, + "num_tokens": 259166114.0, + "step": 10012 + }, + { + "epoch": 1.0996046562705908, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5181682109832764, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7134968042373657, + "num_tokens": 259188512.0, + "step": 10013 + }, + { + "epoch": 1.0997144739732045, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.392831325531006, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7009914517402649, + "num_tokens": 259213322.0, + "step": 10014 + }, + { + "epoch": 1.099824291675818, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.118316888809204, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7038063406944275, + "num_tokens": 259244247.0, + "step": 10015 + }, + { + "epoch": 1.0999341093784318, + "ewc_loss": 
1.6450881958007812e-05, + "grad_norm": 2.2742650508880615, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7197498083114624, + "num_tokens": 259272335.0, + "step": 10016 + }, + { + "epoch": 1.1000439270810454, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3142309188842773, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7173761129379272, + "num_tokens": 259299824.0, + "step": 10017 + }, + { + "epoch": 1.1001537447836591, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3809401988983154, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7215068340301514, + "num_tokens": 259322970.0, + "step": 10018 + }, + { + "epoch": 1.1002635624862729, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2871735095977783, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7058837413787842, + "num_tokens": 259352116.0, + "step": 10019 + }, + { + "epoch": 1.1003733801888864, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.177600622177124, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.69035804271698, + "num_tokens": 259383382.0, + "step": 10020 + }, + { + "epoch": 1.1004831978915002, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.139481782913208, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7117377519607544, + "num_tokens": 259412079.0, + "step": 10021 + }, + { + "epoch": 1.1005930155941137, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.127049207687378, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.705609917640686, + "num_tokens": 259445070.0, + "step": 10022 + }, + { + "epoch": 1.1007028332967275, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.259397268295288, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7110212445259094, + "num_tokens": 259473069.0, + "step": 10023 + }, + { + "epoch": 1.100812650999341, + "ewc_loss": 1.6450881958007812e-05, + 
"grad_norm": 2.422689199447632, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.707999587059021, + "num_tokens": 259498640.0, + "step": 10024 + }, + { + "epoch": 1.1009224687019548, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3855628967285156, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6875944137573242, + "num_tokens": 259525249.0, + "step": 10025 + }, + { + "epoch": 1.1010322864045685, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.152130603790283, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6878398060798645, + "num_tokens": 259558002.0, + "step": 10026 + }, + { + "epoch": 1.101142104107182, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.173552989959717, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7181424498558044, + "num_tokens": 259587005.0, + "step": 10027 + }, + { + "epoch": 1.1012519218097958, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.636031150817871, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6952180862426758, + "num_tokens": 259608555.0, + "step": 10028 + }, + { + "epoch": 1.1013617395124093, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3936500549316406, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7091060876846313, + "num_tokens": 259632527.0, + "step": 10029 + }, + { + "epoch": 1.101471557215023, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.334315061569214, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7315707802772522, + "num_tokens": 259657114.0, + "step": 10030 + }, + { + "epoch": 1.1015813749176366, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.329864978790283, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7121200561523438, + "num_tokens": 259687056.0, + "step": 10031 + }, + { + "epoch": 1.1016911926202504, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.492434024810791, + 
"learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7241072654724121, + "num_tokens": 259711041.0, + "step": 10032 + }, + { + "epoch": 1.1018010103228641, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3906021118164062, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6996578574180603, + "num_tokens": 259735799.0, + "step": 10033 + }, + { + "epoch": 1.1019108280254777, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5314102172851562, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7133110761642456, + "num_tokens": 259759245.0, + "step": 10034 + }, + { + "epoch": 1.1020206457280914, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.338726282119751, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7043961882591248, + "num_tokens": 259786430.0, + "step": 10035 + }, + { + "epoch": 1.102130463430705, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3678576946258545, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6958303451538086, + "num_tokens": 259812205.0, + "step": 10036 + }, + { + "epoch": 1.1022402811333187, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.586099147796631, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7131443023681641, + "num_tokens": 259834090.0, + "step": 10037 + }, + { + "epoch": 1.1023500988359323, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3833096027374268, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7108621597290039, + "num_tokens": 259859684.0, + "step": 10038 + }, + { + "epoch": 1.102459916538546, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2455904483795166, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7227777242660522, + "num_tokens": 259887694.0, + "step": 10039 + }, + { + "epoch": 1.1025697342411598, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3678572177886963, + "learning_rate": 1e-06, + 
"loss": 1.0202, + "mean_token_accuracy": 0.6999121308326721, + "num_tokens": 259915361.0, + "step": 10040 + }, + { + "epoch": 1.1026795519437733, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5752956867218018, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7016886472702026, + "num_tokens": 259939604.0, + "step": 10041 + }, + { + "epoch": 1.102789369646387, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.338921308517456, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6968493461608887, + "num_tokens": 259964570.0, + "step": 10042 + }, + { + "epoch": 1.1028991873490006, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.452601909637451, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7226107716560364, + "num_tokens": 259988233.0, + "step": 10043 + }, + { + "epoch": 1.1030090050516144, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3608627319335938, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6970739364624023, + "num_tokens": 260016806.0, + "step": 10044 + }, + { + "epoch": 1.103118822754228, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3327419757843018, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7022264003753662, + "num_tokens": 260044011.0, + "step": 10045 + }, + { + "epoch": 1.1032286404568417, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5454318523406982, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7154774069786072, + "num_tokens": 260068734.0, + "step": 10046 + }, + { + "epoch": 1.1033384581594552, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3651649951934814, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7190248370170593, + "num_tokens": 260093825.0, + "step": 10047 + }, + { + "epoch": 1.103448275862069, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2370693683624268, + "learning_rate": 1e-06, + "loss": 1.0719, + 
"mean_token_accuracy": 0.6844717860221863, + "num_tokens": 260122935.0, + "step": 10048 + }, + { + "epoch": 1.1035580935646827, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4425551891326904, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7224732041358948, + "num_tokens": 260148211.0, + "step": 10049 + }, + { + "epoch": 1.1036679112672962, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.535287380218506, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7197685241699219, + "num_tokens": 260171463.0, + "step": 10050 + }, + { + "epoch": 1.10377772896991, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.497208833694458, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7262429594993591, + "num_tokens": 260194896.0, + "step": 10051 + }, + { + "epoch": 1.1038875466725235, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 3.2489004135131836, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6925339102745056, + "num_tokens": 260218383.0, + "step": 10052 + }, + { + "epoch": 1.1039973643751373, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3849520683288574, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7000584602355957, + "num_tokens": 260244991.0, + "step": 10053 + }, + { + "epoch": 1.104107182077751, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4310414791107178, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7159374952316284, + "num_tokens": 260271284.0, + "step": 10054 + }, + { + "epoch": 1.1042169997803646, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.600595474243164, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6944669485092163, + "num_tokens": 260294880.0, + "step": 10055 + }, + { + "epoch": 1.1043268174829783, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.418118715286255, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 
0.6814754605293274, + "num_tokens": 260322375.0, + "step": 10056 + }, + { + "epoch": 1.1044366351855919, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3429017066955566, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6874975562095642, + "num_tokens": 260352339.0, + "step": 10057 + }, + { + "epoch": 1.1045464528882056, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5514166355133057, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7085734605789185, + "num_tokens": 260376081.0, + "step": 10058 + }, + { + "epoch": 1.1046562705908192, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.519857883453369, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7028378844261169, + "num_tokens": 260400360.0, + "step": 10059 + }, + { + "epoch": 1.104766088293433, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.362990140914917, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7136508226394653, + "num_tokens": 260427736.0, + "step": 10060 + }, + { + "epoch": 1.1048759059960465, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4238815307617188, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7051687240600586, + "num_tokens": 260451174.0, + "step": 10061 + }, + { + "epoch": 1.1049857236986602, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5632104873657227, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7063247561454773, + "num_tokens": 260474631.0, + "step": 10062 + }, + { + "epoch": 1.105095541401274, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3984806537628174, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7257477045059204, + "num_tokens": 260499111.0, + "step": 10063 + }, + { + "epoch": 1.1052053591038875, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.674058675765991, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7181311845779419, + 
"num_tokens": 260520362.0, + "step": 10064 + }, + { + "epoch": 1.1053151768065013, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3315916061401367, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7226600646972656, + "num_tokens": 260547511.0, + "step": 10065 + }, + { + "epoch": 1.1054249945091148, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5476741790771484, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7379290461540222, + "num_tokens": 260569000.0, + "step": 10066 + }, + { + "epoch": 1.1055348122117286, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.0143022537231445, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7170075178146362, + "num_tokens": 260602601.0, + "step": 10067 + }, + { + "epoch": 1.105644629914342, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.323194980621338, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6972019076347351, + "num_tokens": 260631014.0, + "step": 10068 + }, + { + "epoch": 1.1057544476169558, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4356110095977783, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7178398966789246, + "num_tokens": 260656564.0, + "step": 10069 + }, + { + "epoch": 1.1058642653195696, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4118263721466064, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7178134918212891, + "num_tokens": 260678529.0, + "step": 10070 + }, + { + "epoch": 1.1059740830221831, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3297436237335205, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7229708433151245, + "num_tokens": 260703650.0, + "step": 10071 + }, + { + "epoch": 1.106083900724797, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5745716094970703, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7121763825416565, + "num_tokens": 260725587.0, 
+ "step": 10072 + }, + { + "epoch": 1.1061937184274104, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4433727264404297, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7300697565078735, + "num_tokens": 260749706.0, + "step": 10073 + }, + { + "epoch": 1.1063035361300242, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3471381664276123, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7293586730957031, + "num_tokens": 260775255.0, + "step": 10074 + }, + { + "epoch": 1.1064133538326377, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.2687909603118896, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7122280597686768, + "num_tokens": 260803395.0, + "step": 10075 + }, + { + "epoch": 1.1065231715352515, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.530517816543579, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7090083956718445, + "num_tokens": 260827511.0, + "step": 10076 + }, + { + "epoch": 1.1066329892378652, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.20538067817688, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7099634408950806, + "num_tokens": 260857517.0, + "step": 10077 + }, + { + "epoch": 1.1067428069404788, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.594743251800537, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7178398370742798, + "num_tokens": 260881444.0, + "step": 10078 + }, + { + "epoch": 1.1068526246430925, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5205769538879395, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.717630922794342, + "num_tokens": 260907718.0, + "step": 10079 + }, + { + "epoch": 1.106962442345706, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3827919960021973, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7062190771102905, + "num_tokens": 260934543.0, + "step": 10080 + }, + { + 
"epoch": 1.1070722600483198, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5354959964752197, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6969538927078247, + "num_tokens": 260960660.0, + "step": 10081 + }, + { + "epoch": 1.1071820777509334, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.329801559448242, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7134058475494385, + "num_tokens": 260988653.0, + "step": 10082 + }, + { + "epoch": 1.1072918954535471, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.309931516647339, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7030733823776245, + "num_tokens": 261017363.0, + "step": 10083 + }, + { + "epoch": 1.1074017131561609, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3202903270721436, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7046828269958496, + "num_tokens": 261044320.0, + "step": 10084 + }, + { + "epoch": 1.1075115308587744, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.337693691253662, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7344406843185425, + "num_tokens": 261069403.0, + "step": 10085 + }, + { + "epoch": 1.1076213485613882, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.800579309463501, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7168437838554382, + "num_tokens": 261088395.0, + "step": 10086 + }, + { + "epoch": 1.1077311662640017, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3981196880340576, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7271062135696411, + "num_tokens": 261113142.0, + "step": 10087 + }, + { + "epoch": 1.1078409839666155, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5914671421051025, + "learning_rate": 1e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6862757205963135, + "num_tokens": 261135076.0, + "step": 10088 + }, + { + "epoch": 1.107950801669229, + 
"ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.370755910873413, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7039132118225098, + "num_tokens": 261159556.0, + "step": 10089 + }, + { + "epoch": 1.1080606193718427, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.60189151763916, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7164925932884216, + "num_tokens": 261185178.0, + "step": 10090 + }, + { + "epoch": 1.1081704370744565, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.405372142791748, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7371649742126465, + "num_tokens": 261210986.0, + "step": 10091 + }, + { + "epoch": 1.10828025477707, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.172128915786743, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6967263221740723, + "num_tokens": 261242510.0, + "step": 10092 + }, + { + "epoch": 1.1083900724796838, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.3458404541015625, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7104519009590149, + "num_tokens": 261267792.0, + "step": 10093 + }, + { + "epoch": 1.1084998901822973, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.5236589908599854, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7199421525001526, + "num_tokens": 261293488.0, + "step": 10094 + }, + { + "epoch": 1.108609707884911, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.7458291053771973, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7169120907783508, + "num_tokens": 261313720.0, + "step": 10095 + }, + { + "epoch": 1.1087195255875246, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.9358747005462646, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7318692803382874, + "num_tokens": 261331486.0, + "step": 10096 + }, + { + "epoch": 1.1088293432901384, + "ewc_loss": 
1.6450881958007812e-05, + "grad_norm": 2.5485148429870605, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7113729119300842, + "num_tokens": 261353896.0, + "step": 10097 + }, + { + "epoch": 1.108939160992752, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.7401833534240723, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7046009302139282, + "num_tokens": 261376889.0, + "step": 10098 + }, + { + "epoch": 1.1090489786953657, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.820544958114624, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7143267393112183, + "num_tokens": 261395160.0, + "step": 10099 + }, + { + "epoch": 1.1091587963979794, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.814364433288574, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7160028219223022, + "num_tokens": 261421634.0, + "step": 10100 + }, + { + "epoch": 1.109268614100593, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.2903494834899902, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7100874781608582, + "num_tokens": 261450939.0, + "step": 10101 + }, + { + "epoch": 1.1093784318032067, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.58632755279541, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7156816720962524, + "num_tokens": 261474246.0, + "step": 10102 + }, + { + "epoch": 1.1094882495058203, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.3970961570739746, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7309747934341431, + "num_tokens": 261499381.0, + "step": 10103 + }, + { + "epoch": 1.109598067208434, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.209268808364868, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7190128564834595, + "num_tokens": 261529173.0, + "step": 10104 + }, + { + "epoch": 1.1097078849110478, + "ewc_loss": 1.6570091247558594e-05, + 
"grad_norm": 2.121757745742798, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7066048979759216, + "num_tokens": 261562491.0, + "step": 10105 + }, + { + "epoch": 1.1098177026136613, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.4507863521575928, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7149821519851685, + "num_tokens": 261587022.0, + "step": 10106 + }, + { + "epoch": 1.109927520316275, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.4117631912231445, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.7020409107208252, + "num_tokens": 261613628.0, + "step": 10107 + }, + { + "epoch": 1.1100373380188886, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.490356922149658, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7182550430297852, + "num_tokens": 261634952.0, + "step": 10108 + }, + { + "epoch": 1.1101471557215024, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.5059940814971924, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7207283973693848, + "num_tokens": 261657788.0, + "step": 10109 + }, + { + "epoch": 1.110256973424116, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.1739954948425293, + "learning_rate": 1e-06, + "loss": 1.1404, + "mean_token_accuracy": 0.6654415130615234, + "num_tokens": 261691464.0, + "step": 10110 + }, + { + "epoch": 1.1103667911267296, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.5686845779418945, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7249675393104553, + "num_tokens": 261713940.0, + "step": 10111 + }, + { + "epoch": 1.1104766088293432, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.4148292541503906, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7243534326553345, + "num_tokens": 261739358.0, + "step": 10112 + }, + { + "epoch": 1.110586426531957, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 
2.3826253414154053, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7263412475585938, + "num_tokens": 261765350.0, + "step": 10113 + }, + { + "epoch": 1.1106962442345707, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.5212912559509277, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7108778953552246, + "num_tokens": 261790069.0, + "step": 10114 + }, + { + "epoch": 1.1108060619371842, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.428417205810547, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7147216796875, + "num_tokens": 261815604.0, + "step": 10115 + }, + { + "epoch": 1.110915879639798, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.370547294616699, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.696557879447937, + "num_tokens": 261841623.0, + "step": 10116 + }, + { + "epoch": 1.1110256973424115, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.717505693435669, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6911618709564209, + "num_tokens": 261862501.0, + "step": 10117 + }, + { + "epoch": 1.1111355150450253, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.304908275604248, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6942635774612427, + "num_tokens": 261890576.0, + "step": 10118 + }, + { + "epoch": 1.111245332747639, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.3830361366271973, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.725733757019043, + "num_tokens": 261915171.0, + "step": 10119 + }, + { + "epoch": 1.1113551504502526, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.3283276557922363, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.72602379322052, + "num_tokens": 261942007.0, + "step": 10120 + }, + { + "epoch": 1.1114649681528663, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.3910558223724365, + 
"learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6986309289932251, + "num_tokens": 261966752.0, + "step": 10121 + }, + { + "epoch": 1.1115747858554799, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.591062068939209, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7072770595550537, + "num_tokens": 261990830.0, + "step": 10122 + }, + { + "epoch": 1.1116846035580936, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.4581921100616455, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6996086239814758, + "num_tokens": 262015820.0, + "step": 10123 + }, + { + "epoch": 1.1117944212607072, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.8138272762298584, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7298696637153625, + "num_tokens": 262034653.0, + "step": 10124 + }, + { + "epoch": 1.111904238963321, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.3528337478637695, + "learning_rate": 1e-06, + "loss": 1.1056, + "mean_token_accuracy": 0.6782339811325073, + "num_tokens": 262065968.0, + "step": 10125 + }, + { + "epoch": 1.1120140566659344, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.046318531036377, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6970421075820923, + "num_tokens": 262102326.0, + "step": 10126 + }, + { + "epoch": 1.1121238743685482, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.3041841983795166, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6977252960205078, + "num_tokens": 262129269.0, + "step": 10127 + }, + { + "epoch": 1.112233692071162, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.32328462600708, + "learning_rate": 1e-06, + "loss": 1.1105, + "mean_token_accuracy": 0.6748225092887878, + "num_tokens": 262159214.0, + "step": 10128 + }, + { + "epoch": 1.1123435097737755, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.267254114151001, + "learning_rate": 1e-06, + 
"loss": 1.0379, + "mean_token_accuracy": 0.6938556432723999, + "num_tokens": 262187605.0, + "step": 10129 + }, + { + "epoch": 1.1124533274763893, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.356085777282715, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6944218873977661, + "num_tokens": 262215300.0, + "step": 10130 + }, + { + "epoch": 1.1125631451790028, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 2.4784247875213623, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.715415358543396, + "num_tokens": 262237667.0, + "step": 10131 + }, + { + "epoch": 1.1126729628816165, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.5657665729522705, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7119061946868896, + "num_tokens": 262260847.0, + "step": 10132 + }, + { + "epoch": 1.11278278058423, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.2237160205841064, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7291780114173889, + "num_tokens": 262288164.0, + "step": 10133 + }, + { + "epoch": 1.1128925982868438, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.296403646469116, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.70997554063797, + "num_tokens": 262315357.0, + "step": 10134 + }, + { + "epoch": 1.1130024159894576, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.318514347076416, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7436290979385376, + "num_tokens": 262340817.0, + "step": 10135 + }, + { + "epoch": 1.1131122336920711, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.2957968711853027, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7150163054466248, + "num_tokens": 262369774.0, + "step": 10136 + }, + { + "epoch": 1.1132220513946849, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.430996894836426, + "learning_rate": 1e-06, + "loss": 1.0314, + 
"mean_token_accuracy": 0.6986262798309326, + "num_tokens": 262395648.0, + "step": 10137 + }, + { + "epoch": 1.1133318690972984, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.32130765914917, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7114507555961609, + "num_tokens": 262421124.0, + "step": 10138 + }, + { + "epoch": 1.1134416867999122, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.2922983169555664, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7305803894996643, + "num_tokens": 262446550.0, + "step": 10139 + }, + { + "epoch": 1.1135515045025257, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.304370403289795, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7272050976753235, + "num_tokens": 262471616.0, + "step": 10140 + }, + { + "epoch": 1.1136613222051395, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.956342935562134, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.715096116065979, + "num_tokens": 262489414.0, + "step": 10141 + }, + { + "epoch": 1.1137711399077532, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.1444926261901855, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6970553398132324, + "num_tokens": 262526085.0, + "step": 10142 + }, + { + "epoch": 1.1138809576103668, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.518636703491211, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7083662748336792, + "num_tokens": 262550686.0, + "step": 10143 + }, + { + "epoch": 1.1139907753129805, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.382205009460449, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6945408582687378, + "num_tokens": 262575657.0, + "step": 10144 + }, + { + "epoch": 1.114100593015594, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.373562812805176, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 
0.682440996170044, + "num_tokens": 262603127.0, + "step": 10145 + }, + { + "epoch": 1.1142104107182078, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.4253909587860107, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6900400519371033, + "num_tokens": 262628529.0, + "step": 10146 + }, + { + "epoch": 1.1143202284208213, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.3427860736846924, + "learning_rate": 1e-06, + "loss": 1.1062, + "mean_token_accuracy": 0.6789840459823608, + "num_tokens": 262656454.0, + "step": 10147 + }, + { + "epoch": 1.114430046123435, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.170138359069824, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7325587272644043, + "num_tokens": 262685153.0, + "step": 10148 + }, + { + "epoch": 1.1145398638260489, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.2595951557159424, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6951416730880737, + "num_tokens": 262713813.0, + "step": 10149 + }, + { + "epoch": 1.1146496815286624, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.274285316467285, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7179479598999023, + "num_tokens": 262743376.0, + "step": 10150 + }, + { + "epoch": 1.1147594992312762, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.344196081161499, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7315118312835693, + "num_tokens": 262770450.0, + "step": 10151 + }, + { + "epoch": 1.1148693169338897, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.439592123031616, + "learning_rate": 1e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.6843649744987488, + "num_tokens": 262795402.0, + "step": 10152 + }, + { + "epoch": 1.1149791346365034, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.657541036605835, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.693820595741272, + 
"num_tokens": 262818722.0, + "step": 10153 + }, + { + "epoch": 1.115088952339117, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.459852457046509, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7329432368278503, + "num_tokens": 262841738.0, + "step": 10154 + }, + { + "epoch": 1.1151987700417307, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.3448731899261475, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7351641058921814, + "num_tokens": 262865964.0, + "step": 10155 + }, + { + "epoch": 1.1153085877443445, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 2.861201286315918, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6960783004760742, + "num_tokens": 262885839.0, + "step": 10156 + }, + { + "epoch": 1.115418405446958, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.5417022705078125, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7168665528297424, + "num_tokens": 262908507.0, + "step": 10157 + }, + { + "epoch": 1.1155282231495718, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.607431650161743, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.713518500328064, + "num_tokens": 262931525.0, + "step": 10158 + }, + { + "epoch": 1.1156380408521853, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.7049691677093506, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6998321413993835, + "num_tokens": 262954414.0, + "step": 10159 + }, + { + "epoch": 1.115747858554799, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.50849986076355, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6990904808044434, + "num_tokens": 262976802.0, + "step": 10160 + }, + { + "epoch": 1.1158576762574126, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4174246788024902, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7104310989379883, + "num_tokens": 263006436.0, + 
"step": 10161 + }, + { + "epoch": 1.1159674939600264, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4124948978424072, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6942670345306396, + "num_tokens": 263033693.0, + "step": 10162 + }, + { + "epoch": 1.11607731166264, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.655379056930542, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.710846483707428, + "num_tokens": 263055731.0, + "step": 10163 + }, + { + "epoch": 1.1161871293652537, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4839277267456055, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7019484639167786, + "num_tokens": 263081564.0, + "step": 10164 + }, + { + "epoch": 1.1162969470678674, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.337961196899414, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7144001722335815, + "num_tokens": 263110244.0, + "step": 10165 + }, + { + "epoch": 1.116406764770481, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4555411338806152, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6970669627189636, + "num_tokens": 263135014.0, + "step": 10166 + }, + { + "epoch": 1.1165165824730947, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6330766677856445, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6924312114715576, + "num_tokens": 263156727.0, + "step": 10167 + }, + { + "epoch": 1.1166264001757082, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4059250354766846, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7170754671096802, + "num_tokens": 263183522.0, + "step": 10168 + }, + { + "epoch": 1.116736217878322, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.51177978515625, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7269067764282227, + "num_tokens": 263208367.0, + "step": 10169 + }, + { + 
"epoch": 1.1168460355809358, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.744717836380005, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7441768646240234, + "num_tokens": 263227569.0, + "step": 10170 + }, + { + "epoch": 1.1169558532835493, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2768239974975586, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7143479585647583, + "num_tokens": 263255747.0, + "step": 10171 + }, + { + "epoch": 1.117065670986163, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.5915188789367676, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7129736542701721, + "num_tokens": 263277067.0, + "step": 10172 + }, + { + "epoch": 1.1171754886887766, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.468348503112793, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7179878354072571, + "num_tokens": 263300135.0, + "step": 10173 + }, + { + "epoch": 1.1172853063913903, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.863898754119873, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7284106016159058, + "num_tokens": 263318790.0, + "step": 10174 + }, + { + "epoch": 1.1173951240940039, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.249286651611328, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7063689231872559, + "num_tokens": 263348357.0, + "step": 10175 + }, + { + "epoch": 1.1175049417966176, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4412941932678223, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.729069709777832, + "num_tokens": 263372010.0, + "step": 10176 + }, + { + "epoch": 1.1176147594992312, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.390141010284424, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7052277326583862, + "num_tokens": 263397378.0, + "step": 10177 + }, + { + "epoch": 1.117724577201845, + 
"ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.309577465057373, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.713225781917572, + "num_tokens": 263422840.0, + "step": 10178 + }, + { + "epoch": 1.1178343949044587, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.43123459815979, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.71439129114151, + "num_tokens": 263447140.0, + "step": 10179 + }, + { + "epoch": 1.1179442126070722, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.169689655303955, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6956586837768555, + "num_tokens": 263477950.0, + "step": 10180 + }, + { + "epoch": 1.118054030309686, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.066500425338745, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6912975311279297, + "num_tokens": 263511333.0, + "step": 10181 + }, + { + "epoch": 1.1181638480122995, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.1733407974243164, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7055521011352539, + "num_tokens": 263539008.0, + "step": 10182 + }, + { + "epoch": 1.1182736657149133, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6784162521362305, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7202277183532715, + "num_tokens": 263559854.0, + "step": 10183 + }, + { + "epoch": 1.1183834834175268, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4942517280578613, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7110447883605957, + "num_tokens": 263583069.0, + "step": 10184 + }, + { + "epoch": 1.1184933011201406, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6074726581573486, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7465053796768188, + "num_tokens": 263603230.0, + "step": 10185 + }, + { + "epoch": 1.1186031188227543, + "ewc_loss": 
1.6689300537109375e-05, + "grad_norm": 2.2462875843048096, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7024165391921997, + "num_tokens": 263632745.0, + "step": 10186 + }, + { + "epoch": 1.1187129365253679, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4889705181121826, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7010911703109741, + "num_tokens": 263656588.0, + "step": 10187 + }, + { + "epoch": 1.1188227542279816, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.612192153930664, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7219524383544922, + "num_tokens": 263677098.0, + "step": 10188 + }, + { + "epoch": 1.1189325719305951, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.5130765438079834, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6966304779052734, + "num_tokens": 263700576.0, + "step": 10189 + }, + { + "epoch": 1.119042389633209, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2403407096862793, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7061010599136353, + "num_tokens": 263728753.0, + "step": 10190 + }, + { + "epoch": 1.1191522073358224, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.1984739303588867, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7061311602592468, + "num_tokens": 263754501.0, + "step": 10191 + }, + { + "epoch": 1.1192620250384362, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2663567066192627, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6994566917419434, + "num_tokens": 263781638.0, + "step": 10192 + }, + { + "epoch": 1.11937184274105, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4412941932678223, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6952954530715942, + "num_tokens": 263806214.0, + "step": 10193 + }, + { + "epoch": 1.1194816604436635, + "ewc_loss": 1.6689300537109375e-05, + 
"grad_norm": 2.2138121128082275, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7035882472991943, + "num_tokens": 263834287.0, + "step": 10194 + }, + { + "epoch": 1.1195914781462772, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.364856719970703, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7063554525375366, + "num_tokens": 263861180.0, + "step": 10195 + }, + { + "epoch": 1.1197012958488908, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.598876476287842, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.72046959400177, + "num_tokens": 263883046.0, + "step": 10196 + }, + { + "epoch": 1.1198111135515045, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.327662467956543, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7168365120887756, + "num_tokens": 263910091.0, + "step": 10197 + }, + { + "epoch": 1.119920931254118, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.1962502002716064, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7206931710243225, + "num_tokens": 263937454.0, + "step": 10198 + }, + { + "epoch": 1.1200307489567318, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3461506366729736, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7135941982269287, + "num_tokens": 263963185.0, + "step": 10199 + }, + { + "epoch": 1.1201405666593456, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6045339107513428, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7216852903366089, + "num_tokens": 263985334.0, + "step": 10200 + }, + { + "epoch": 1.1202503843619591, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3062760829925537, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6924256682395935, + "num_tokens": 264013050.0, + "step": 10201 + }, + { + "epoch": 1.1203602020645729, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 
2.4783291816711426, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6994478702545166, + "num_tokens": 264040065.0, + "step": 10202 + }, + { + "epoch": 1.1204700197671864, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2569591999053955, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6995928287506104, + "num_tokens": 264067824.0, + "step": 10203 + }, + { + "epoch": 1.1205798374698002, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2690043449401855, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6956363916397095, + "num_tokens": 264094078.0, + "step": 10204 + }, + { + "epoch": 1.1206896551724137, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2650084495544434, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7001820206642151, + "num_tokens": 264122206.0, + "step": 10205 + }, + { + "epoch": 1.1207994728750275, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.5117170810699463, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.695075511932373, + "num_tokens": 264146490.0, + "step": 10206 + }, + { + "epoch": 1.1209092905776412, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 3.2864503860473633, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7345934510231018, + "num_tokens": 264162769.0, + "step": 10207 + }, + { + "epoch": 1.1210191082802548, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.456571340560913, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7182499766349792, + "num_tokens": 264185118.0, + "step": 10208 + }, + { + "epoch": 1.1211289259828685, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.151958465576172, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7052176594734192, + "num_tokens": 264217125.0, + "step": 10209 + }, + { + "epoch": 1.121238743685482, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.296013832092285, + 
"learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7239279747009277, + "num_tokens": 264243982.0, + "step": 10210 + }, + { + "epoch": 1.1213485613880958, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.493628740310669, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6960374116897583, + "num_tokens": 264268132.0, + "step": 10211 + }, + { + "epoch": 1.1214583790907093, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.535823106765747, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7122210264205933, + "num_tokens": 264290913.0, + "step": 10212 + }, + { + "epoch": 1.121568196793323, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.200392961502075, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7253844738006592, + "num_tokens": 264317797.0, + "step": 10213 + }, + { + "epoch": 1.1216780144959368, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.516371726989746, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.707192063331604, + "num_tokens": 264343062.0, + "step": 10214 + }, + { + "epoch": 1.1217878321985504, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.5440080165863037, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7061702013015747, + "num_tokens": 264365747.0, + "step": 10215 + }, + { + "epoch": 1.1218976499011641, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.459685802459717, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6996340155601501, + "num_tokens": 264389155.0, + "step": 10216 + }, + { + "epoch": 1.1220074676037777, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3968591690063477, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7545729279518127, + "num_tokens": 264412724.0, + "step": 10217 + }, + { + "epoch": 1.1221172853063914, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4401726722717285, + "learning_rate": 1e-06, + 
"loss": 1.0279, + "mean_token_accuracy": 0.6970236301422119, + "num_tokens": 264437737.0, + "step": 10218 + }, + { + "epoch": 1.122227103009005, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4369068145751953, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6918168663978577, + "num_tokens": 264464141.0, + "step": 10219 + }, + { + "epoch": 1.1223369207116187, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.423321008682251, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7040104269981384, + "num_tokens": 264489951.0, + "step": 10220 + }, + { + "epoch": 1.1224467384142325, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.149812936782837, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6946316957473755, + "num_tokens": 264522420.0, + "step": 10221 + }, + { + "epoch": 1.122556556116846, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3482742309570312, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7032877206802368, + "num_tokens": 264550110.0, + "step": 10222 + }, + { + "epoch": 1.1226663738194598, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3500397205352783, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6960018873214722, + "num_tokens": 264577054.0, + "step": 10223 + }, + { + "epoch": 1.1227761915220733, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.464914083480835, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7071822285652161, + "num_tokens": 264602938.0, + "step": 10224 + }, + { + "epoch": 1.122886009224687, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3731203079223633, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7108267545700073, + "num_tokens": 264629059.0, + "step": 10225 + }, + { + "epoch": 1.1229958269273006, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3504626750946045, + "learning_rate": 1e-06, + "loss": 0.9279, + 
"mean_token_accuracy": 0.7218332290649414, + "num_tokens": 264655928.0, + "step": 10226 + }, + { + "epoch": 1.1231056446299144, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.396369218826294, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7228960394859314, + "num_tokens": 264681093.0, + "step": 10227 + }, + { + "epoch": 1.123215462332528, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4043924808502197, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7233820557594299, + "num_tokens": 264705936.0, + "step": 10228 + }, + { + "epoch": 1.1233252800351416, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4024624824523926, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7031693458557129, + "num_tokens": 264731489.0, + "step": 10229 + }, + { + "epoch": 1.1234350977377554, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.562770366668701, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7283480167388916, + "num_tokens": 264753490.0, + "step": 10230 + }, + { + "epoch": 1.123544915440369, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.44415545463562, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.705653727054596, + "num_tokens": 264779633.0, + "step": 10231 + }, + { + "epoch": 1.1236547331429827, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3898651599884033, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6997755169868469, + "num_tokens": 264804496.0, + "step": 10232 + }, + { + "epoch": 1.1237645508455962, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3416194915771484, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6847429275512695, + "num_tokens": 264832883.0, + "step": 10233 + }, + { + "epoch": 1.12387436854821, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.350442409515381, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 
0.7244802117347717, + "num_tokens": 264858624.0, + "step": 10234 + }, + { + "epoch": 1.1239841862508237, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.7328755855560303, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7037001848220825, + "num_tokens": 264880895.0, + "step": 10235 + }, + { + "epoch": 1.1240940039534373, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3763155937194824, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6918144822120667, + "num_tokens": 264907135.0, + "step": 10236 + }, + { + "epoch": 1.124203821656051, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3617255687713623, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7056936025619507, + "num_tokens": 264934646.0, + "step": 10237 + }, + { + "epoch": 1.1243136393586646, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.1720237731933594, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7134521007537842, + "num_tokens": 264963994.0, + "step": 10238 + }, + { + "epoch": 1.1244234570612783, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.477006435394287, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7159647941589355, + "num_tokens": 264988809.0, + "step": 10239 + }, + { + "epoch": 1.1245332747638919, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4708619117736816, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7194504737854004, + "num_tokens": 265011918.0, + "step": 10240 + }, + { + "epoch": 1.1246430924665056, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2547757625579834, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7076697945594788, + "num_tokens": 265040220.0, + "step": 10241 + }, + { + "epoch": 1.1247529101691192, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.39108943939209, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7248510122299194, + 
"num_tokens": 265066840.0, + "step": 10242 + }, + { + "epoch": 1.124862727871733, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6479365825653076, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7216721177101135, + "num_tokens": 265090591.0, + "step": 10243 + }, + { + "epoch": 1.1249725455743467, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.41780686378479, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.702119767665863, + "num_tokens": 265117912.0, + "step": 10244 + }, + { + "epoch": 1.1250823632769602, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.474552869796753, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7059478759765625, + "num_tokens": 265143335.0, + "step": 10245 + }, + { + "epoch": 1.125192180979574, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4732279777526855, + "learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6861018538475037, + "num_tokens": 265168875.0, + "step": 10246 + }, + { + "epoch": 1.1253019986821875, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3546056747436523, + "learning_rate": 1e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6899380683898926, + "num_tokens": 265195136.0, + "step": 10247 + }, + { + "epoch": 1.1254118163848013, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.2878592014312744, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7048245668411255, + "num_tokens": 265222875.0, + "step": 10248 + }, + { + "epoch": 1.125521634087415, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.7563226222991943, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7146583795547485, + "num_tokens": 265242173.0, + "step": 10249 + }, + { + "epoch": 1.1256314517900285, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.459244966506958, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7306556701660156, + "num_tokens": 265265667.0, + 
"step": 10250 + }, + { + "epoch": 1.1257412694926423, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 7.190312385559082, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6885694265365601, + "num_tokens": 265295514.0, + "step": 10251 + }, + { + "epoch": 1.1258510871952558, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.5740435123443604, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7290542125701904, + "num_tokens": 265317261.0, + "step": 10252 + }, + { + "epoch": 1.1259609048978696, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.593627691268921, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6934248805046082, + "num_tokens": 265342770.0, + "step": 10253 + }, + { + "epoch": 1.1260707226004831, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.7690162658691406, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7116690874099731, + "num_tokens": 265363346.0, + "step": 10254 + }, + { + "epoch": 1.1261805403030969, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.406761884689331, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.7072141766548157, + "num_tokens": 265391417.0, + "step": 10255 + }, + { + "epoch": 1.1262903580057104, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6457784175872803, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7360386848449707, + "num_tokens": 265412658.0, + "step": 10256 + }, + { + "epoch": 1.1264001757083242, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3013041019439697, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7168402671813965, + "num_tokens": 265439582.0, + "step": 10257 + }, + { + "epoch": 1.126509993410938, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3872861862182617, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.708518922328949, + "num_tokens": 265466946.0, + "step": 10258 + }, + { + 
"epoch": 1.1266198111135515, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.1374218463897705, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7054784297943115, + "num_tokens": 265499854.0, + "step": 10259 + }, + { + "epoch": 1.1267296288161652, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6786344051361084, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7227588891983032, + "num_tokens": 265523938.0, + "step": 10260 + }, + { + "epoch": 1.1268394465187788, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.0119471549987793, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.7102128267288208, + "num_tokens": 265560468.0, + "step": 10261 + }, + { + "epoch": 1.1269492642213925, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.573641777038574, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7013459801673889, + "num_tokens": 265583667.0, + "step": 10262 + }, + { + "epoch": 1.127059081924006, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.385826587677002, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7307973504066467, + "num_tokens": 265606595.0, + "step": 10263 + }, + { + "epoch": 1.1271688996266198, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.326784133911133, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.701779842376709, + "num_tokens": 265633660.0, + "step": 10264 + }, + { + "epoch": 1.1272787173292333, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.309718132019043, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.704497218132019, + "num_tokens": 265659487.0, + "step": 10265 + }, + { + "epoch": 1.127388535031847, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2501914501190186, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.701063334941864, + "num_tokens": 265688273.0, + "step": 10266 + }, + { + "epoch": 1.1274983527344609, + 
"ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3904871940612793, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6887215375900269, + "num_tokens": 265713972.0, + "step": 10267 + }, + { + "epoch": 1.1276081704370744, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.443321704864502, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6879023909568787, + "num_tokens": 265739755.0, + "step": 10268 + }, + { + "epoch": 1.1277179881396882, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6635284423828125, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6880092024803162, + "num_tokens": 265764051.0, + "step": 10269 + }, + { + "epoch": 1.1278278058423017, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.413879871368408, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7058265805244446, + "num_tokens": 265789653.0, + "step": 10270 + }, + { + "epoch": 1.1279376235449154, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.281409978866577, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7217919826507568, + "num_tokens": 265816462.0, + "step": 10271 + }, + { + "epoch": 1.1280474412475292, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.6138455867767334, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7260222434997559, + "num_tokens": 265840306.0, + "step": 10272 + }, + { + "epoch": 1.1281572589501427, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.1936745643615723, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7230507135391235, + "num_tokens": 265870347.0, + "step": 10273 + }, + { + "epoch": 1.1282670766527565, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.570697784423828, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6969063878059387, + "num_tokens": 265894363.0, + "step": 10274 + }, + { + "epoch": 1.12837689435537, + "ewc_loss": 
1.6689300537109375e-05, + "grad_norm": 2.3030624389648438, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7072353363037109, + "num_tokens": 265921549.0, + "step": 10275 + }, + { + "epoch": 1.1284867120579838, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.294583559036255, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.715232253074646, + "num_tokens": 265948626.0, + "step": 10276 + }, + { + "epoch": 1.1285965297605973, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2469394207000732, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6996627449989319, + "num_tokens": 265983453.0, + "step": 10277 + }, + { + "epoch": 1.128706347463211, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.528620958328247, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7241241335868835, + "num_tokens": 266005444.0, + "step": 10278 + }, + { + "epoch": 1.1288161651658246, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.313539981842041, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7292986512184143, + "num_tokens": 266031241.0, + "step": 10279 + }, + { + "epoch": 1.1289259828684384, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.604492664337158, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7189558148384094, + "num_tokens": 266053070.0, + "step": 10280 + }, + { + "epoch": 1.1290358005710521, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.4585587978363037, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6952549815177917, + "num_tokens": 266076940.0, + "step": 10281 + }, + { + "epoch": 1.1291456182736657, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.0518946647644043, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6980228424072266, + "num_tokens": 266111542.0, + "step": 10282 + }, + { + "epoch": 1.1292554359762794, + "ewc_loss": 1.6689300537109375e-05, + 
"grad_norm": 2.3185627460479736, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7397109270095825, + "num_tokens": 266137534.0, + "step": 10283 + }, + { + "epoch": 1.129365253678893, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4770498275756836, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7241297960281372, + "num_tokens": 266161658.0, + "step": 10284 + }, + { + "epoch": 1.1294750713815067, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.5209767818450928, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7098378539085388, + "num_tokens": 266184514.0, + "step": 10285 + }, + { + "epoch": 1.1295848890841205, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.3588080406188965, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6838803291320801, + "num_tokens": 266214113.0, + "step": 10286 + }, + { + "epoch": 1.129694706786734, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.5416648387908936, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7199729681015015, + "num_tokens": 266234590.0, + "step": 10287 + }, + { + "epoch": 1.1298045244893478, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2251505851745605, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7311431765556335, + "num_tokens": 266262165.0, + "step": 10288 + }, + { + "epoch": 1.1299143421919613, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.352839946746826, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7212711572647095, + "num_tokens": 266287184.0, + "step": 10289 + }, + { + "epoch": 1.130024159894575, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.8343288898468018, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.720665454864502, + "num_tokens": 266305758.0, + "step": 10290 + }, + { + "epoch": 1.1301339775971886, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 
2.180842876434326, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7144598960876465, + "num_tokens": 266335294.0, + "step": 10291 + }, + { + "epoch": 1.1302437952998023, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.2734017372131348, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.7077189683914185, + "num_tokens": 266363544.0, + "step": 10292 + }, + { + "epoch": 1.1303536130024159, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.4637107849121094, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6813242435455322, + "num_tokens": 266389150.0, + "step": 10293 + }, + { + "epoch": 1.1304634307050296, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.1791839599609375, + "learning_rate": 1e-06, + "loss": 1.0769, + "mean_token_accuracy": 0.6888495683670044, + "num_tokens": 266418930.0, + "step": 10294 + }, + { + "epoch": 1.1305732484076434, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3869454860687256, + "learning_rate": 1e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.6898202300071716, + "num_tokens": 266445519.0, + "step": 10295 + }, + { + "epoch": 1.130683066110257, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.7777769565582275, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7140964865684509, + "num_tokens": 266465326.0, + "step": 10296 + }, + { + "epoch": 1.1307928838128707, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.187610626220703, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7032870650291443, + "num_tokens": 266498034.0, + "step": 10297 + }, + { + "epoch": 1.1309027015154842, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.5899345874786377, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7038044333457947, + "num_tokens": 266523244.0, + "step": 10298 + }, + { + "epoch": 1.131012519218098, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.1612801551818848, + 
"learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6916178464889526, + "num_tokens": 266556348.0, + "step": 10299 + }, + { + "epoch": 1.1311223369207117, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.2702696323394775, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7028688192367554, + "num_tokens": 266585163.0, + "step": 10300 + }, + { + "epoch": 1.1312321546233253, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.445028781890869, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7280297875404358, + "num_tokens": 266608597.0, + "step": 10301 + }, + { + "epoch": 1.131341972325939, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.564687490463257, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7181576490402222, + "num_tokens": 266631533.0, + "step": 10302 + }, + { + "epoch": 1.1314517900285526, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.320051670074463, + "learning_rate": 1e-06, + "loss": 1.1027, + "mean_token_accuracy": 0.6802116632461548, + "num_tokens": 266662321.0, + "step": 10303 + }, + { + "epoch": 1.1315616077311663, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.857475757598877, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7135130763053894, + "num_tokens": 266681531.0, + "step": 10304 + }, + { + "epoch": 1.1316714254337799, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.2339885234832764, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7215812802314758, + "num_tokens": 266709453.0, + "step": 10305 + }, + { + "epoch": 1.1317812431363936, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3494510650634766, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7229853868484497, + "num_tokens": 266735508.0, + "step": 10306 + }, + { + "epoch": 1.1318910608390071, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.614103317260742, + "learning_rate": 1e-06, + 
"loss": 1.0252, + "mean_token_accuracy": 0.6943867206573486, + "num_tokens": 266758362.0, + "step": 10307 + }, + { + "epoch": 1.132000878541621, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.296505928039551, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6994736194610596, + "num_tokens": 266784633.0, + "step": 10308 + }, + { + "epoch": 1.1321106962442347, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.372488498687744, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7069977521896362, + "num_tokens": 266811149.0, + "step": 10309 + }, + { + "epoch": 1.1322205139468482, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.916602373123169, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7251102924346924, + "num_tokens": 266829030.0, + "step": 10310 + }, + { + "epoch": 1.132330331649462, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.2363221645355225, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7155576944351196, + "num_tokens": 266859845.0, + "step": 10311 + }, + { + "epoch": 1.1324401493520755, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.420492172241211, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7094097137451172, + "num_tokens": 266884529.0, + "step": 10312 + }, + { + "epoch": 1.1325499670546892, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.365715980529785, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6983953714370728, + "num_tokens": 266910291.0, + "step": 10313 + }, + { + "epoch": 1.132659784757303, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.492016553878784, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7302395105361938, + "num_tokens": 266937059.0, + "step": 10314 + }, + { + "epoch": 1.1327696024599165, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.4649035930633545, + "learning_rate": 1e-06, + "loss": 1.0699, + 
"mean_token_accuracy": 0.6936206221580505, + "num_tokens": 266964277.0, + "step": 10315 + }, + { + "epoch": 1.1328794201625303, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.184889078140259, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.7011439800262451, + "num_tokens": 266996463.0, + "step": 10316 + }, + { + "epoch": 1.1329892378651438, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.59679913520813, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7073701620101929, + "num_tokens": 267019702.0, + "step": 10317 + }, + { + "epoch": 1.1330990555677576, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.575117588043213, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7237229347229004, + "num_tokens": 267040614.0, + "step": 10318 + }, + { + "epoch": 1.1332088732703711, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.426366090774536, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7136232852935791, + "num_tokens": 267067908.0, + "step": 10319 + }, + { + "epoch": 1.1333186909729849, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.34999942779541, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7069965600967407, + "num_tokens": 267093389.0, + "step": 10320 + }, + { + "epoch": 1.1334285086755984, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.1666224002838135, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7033749222755432, + "num_tokens": 267123727.0, + "step": 10321 + }, + { + "epoch": 1.1335383263782122, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.231025457382202, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6971795558929443, + "num_tokens": 267153359.0, + "step": 10322 + }, + { + "epoch": 1.133648144080826, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3341081142425537, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 
0.7117067575454712, + "num_tokens": 267180260.0, + "step": 10323 + }, + { + "epoch": 1.1337579617834395, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.2371983528137207, + "learning_rate": 1e-06, + "loss": 1.0967, + "mean_token_accuracy": 0.6807389855384827, + "num_tokens": 267214943.0, + "step": 10324 + }, + { + "epoch": 1.1338677794860532, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.531130075454712, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.707112193107605, + "num_tokens": 267238721.0, + "step": 10325 + }, + { + "epoch": 1.1339775971886668, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.243875741958618, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7133527398109436, + "num_tokens": 267269105.0, + "step": 10326 + }, + { + "epoch": 1.1340874148912805, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.373359203338623, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.7037069797515869, + "num_tokens": 267295469.0, + "step": 10327 + }, + { + "epoch": 1.134197232593894, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.2404584884643555, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6969590187072754, + "num_tokens": 267323179.0, + "step": 10328 + }, + { + "epoch": 1.1343070502965078, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 3.1945149898529053, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7134634256362915, + "num_tokens": 267347769.0, + "step": 10329 + }, + { + "epoch": 1.1344168679991213, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.495697498321533, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.708476185798645, + "num_tokens": 267373815.0, + "step": 10330 + }, + { + "epoch": 1.134526685701735, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.519089460372925, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7105791568756104, + 
"num_tokens": 267396441.0, + "step": 10331 + }, + { + "epoch": 1.1346365034043489, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.463590383529663, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7118890285491943, + "num_tokens": 267422653.0, + "step": 10332 + }, + { + "epoch": 1.1347463211069624, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.6311099529266357, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7182682752609253, + "num_tokens": 267445598.0, + "step": 10333 + }, + { + "epoch": 1.1348561388095761, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.326361894607544, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7229416370391846, + "num_tokens": 267471406.0, + "step": 10334 + }, + { + "epoch": 1.1349659565121897, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.560901641845703, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7089164853096008, + "num_tokens": 267494400.0, + "step": 10335 + }, + { + "epoch": 1.1350757742148034, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.264930486679077, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7223865985870361, + "num_tokens": 267524579.0, + "step": 10336 + }, + { + "epoch": 1.1351855919174172, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.1009442806243896, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7281070351600647, + "num_tokens": 267555767.0, + "step": 10337 + }, + { + "epoch": 1.1352954096200307, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.0732243061065674, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.706978440284729, + "num_tokens": 267589006.0, + "step": 10338 + }, + { + "epoch": 1.1354052273226445, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.5451571941375732, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7150543332099915, + "num_tokens": 267611517.0, + 
"step": 10339 + }, + { + "epoch": 1.135515045025258, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.1493568420410156, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6911371946334839, + "num_tokens": 267644383.0, + "step": 10340 + }, + { + "epoch": 1.1356248627278718, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.533670663833618, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6953942775726318, + "num_tokens": 267667731.0, + "step": 10341 + }, + { + "epoch": 1.1357346804304853, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.5871031284332275, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.7031164765357971, + "num_tokens": 267692049.0, + "step": 10342 + }, + { + "epoch": 1.135844498133099, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.349832773208618, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7192052006721497, + "num_tokens": 267719834.0, + "step": 10343 + }, + { + "epoch": 1.1359543158357126, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.9546525478363037, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7278815507888794, + "num_tokens": 267741202.0, + "step": 10344 + }, + { + "epoch": 1.1360641335383264, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.246455430984497, + "learning_rate": 1e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.6904953718185425, + "num_tokens": 267771467.0, + "step": 10345 + }, + { + "epoch": 1.1361739512409401, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.750312328338623, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7221734523773193, + "num_tokens": 267792675.0, + "step": 10346 + }, + { + "epoch": 1.1362837689435537, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.332648754119873, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6978796720504761, + "num_tokens": 267821926.0, + "step": 10347 + }, + { + 
"epoch": 1.1363935866461674, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2134342193603516, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.7046997547149658, + "num_tokens": 267852223.0, + "step": 10348 + }, + { + "epoch": 1.136503404348781, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3967278003692627, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7044112682342529, + "num_tokens": 267876621.0, + "step": 10349 + }, + { + "epoch": 1.1366132220513947, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.078036308288574, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7063066959381104, + "num_tokens": 267908585.0, + "step": 10350 + }, + { + "epoch": 1.1367230397540085, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.223513126373291, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.7185969352722168, + "num_tokens": 267940588.0, + "step": 10351 + }, + { + "epoch": 1.136832857456622, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.4983022212982178, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7119232416152954, + "num_tokens": 267965779.0, + "step": 10352 + }, + { + "epoch": 1.1369426751592357, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3706507682800293, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6884409189224243, + "num_tokens": 267992571.0, + "step": 10353 + }, + { + "epoch": 1.1370524928618493, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3511526584625244, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.704843282699585, + "num_tokens": 268019872.0, + "step": 10354 + }, + { + "epoch": 1.137162310564463, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.482994794845581, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7163633108139038, + "num_tokens": 268043544.0, + "step": 10355 + }, + { + "epoch": 1.1372721282670766, + 
"ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.33439040184021, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7062548995018005, + "num_tokens": 268069948.0, + "step": 10356 + }, + { + "epoch": 1.1373819459696903, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.770500421524048, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7049272060394287, + "num_tokens": 268089960.0, + "step": 10357 + }, + { + "epoch": 1.1374917636723039, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.575772523880005, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7241853475570679, + "num_tokens": 268111988.0, + "step": 10358 + }, + { + "epoch": 1.1376015813749176, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3937079906463623, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7210000157356262, + "num_tokens": 268137247.0, + "step": 10359 + }, + { + "epoch": 1.1377113990775314, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.381645441055298, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7392767071723938, + "num_tokens": 268161393.0, + "step": 10360 + }, + { + "epoch": 1.137821216780145, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3419816493988037, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.701897144317627, + "num_tokens": 268190923.0, + "step": 10361 + }, + { + "epoch": 1.1379310344827587, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.57843279838562, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.694635808467865, + "num_tokens": 268214444.0, + "step": 10362 + }, + { + "epoch": 1.1380408521853722, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.583857774734497, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7111345529556274, + "num_tokens": 268236428.0, + "step": 10363 + }, + { + "epoch": 1.138150669887986, + "ewc_loss": 1.6808509826660156e-05, 
+ "grad_norm": 2.6720051765441895, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7355194687843323, + "num_tokens": 268258604.0, + "step": 10364 + }, + { + "epoch": 1.1382604875905997, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.342838764190674, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7441832423210144, + "num_tokens": 268285344.0, + "step": 10365 + }, + { + "epoch": 1.1383703052932133, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.1845126152038574, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.69874107837677, + "num_tokens": 268320968.0, + "step": 10366 + }, + { + "epoch": 1.138480122995827, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.415243148803711, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7227106094360352, + "num_tokens": 268347096.0, + "step": 10367 + }, + { + "epoch": 1.1385899406984406, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.644113302230835, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7181326150894165, + "num_tokens": 268373691.0, + "step": 10368 + }, + { + "epoch": 1.1386997584010543, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.5424768924713135, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6974323987960815, + "num_tokens": 268398350.0, + "step": 10369 + }, + { + "epoch": 1.1388095761036678, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.523308277130127, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7182983756065369, + "num_tokens": 268422393.0, + "step": 10370 + }, + { + "epoch": 1.1389193938062816, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.367325782775879, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.736402153968811, + "num_tokens": 268447765.0, + "step": 10371 + }, + { + "epoch": 1.1390292115088951, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 
2.3173482418060303, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7173302173614502, + "num_tokens": 268476834.0, + "step": 10372 + }, + { + "epoch": 1.139139029211509, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.4707369804382324, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7194324731826782, + "num_tokens": 268501276.0, + "step": 10373 + }, + { + "epoch": 1.1392488469141226, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.132997989654541, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7144782543182373, + "num_tokens": 268533575.0, + "step": 10374 + }, + { + "epoch": 1.1393586646167362, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.64559268951416, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7314496040344238, + "num_tokens": 268554506.0, + "step": 10375 + }, + { + "epoch": 1.13946848231935, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.4285101890563965, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7131311297416687, + "num_tokens": 268579127.0, + "step": 10376 + }, + { + "epoch": 1.1395783000219635, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 7.145108699798584, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7023844122886658, + "num_tokens": 268602082.0, + "step": 10377 + }, + { + "epoch": 1.1396881177245772, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5214526653289795, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6934399604797363, + "num_tokens": 268629322.0, + "step": 10378 + }, + { + "epoch": 1.1397979354271908, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5223968029022217, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7304327487945557, + "num_tokens": 268654858.0, + "step": 10379 + }, + { + "epoch": 1.1399077531298045, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.703110456466675, + 
"learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7169145345687866, + "num_tokens": 268674996.0, + "step": 10380 + }, + { + "epoch": 1.1400175708324183, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3892416954040527, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7000959515571594, + "num_tokens": 268701396.0, + "step": 10381 + }, + { + "epoch": 1.1401273885350318, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.539884567260742, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7102251052856445, + "num_tokens": 268725730.0, + "step": 10382 + }, + { + "epoch": 1.1402372062376456, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.2898452281951904, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7287997007369995, + "num_tokens": 268756266.0, + "step": 10383 + }, + { + "epoch": 1.140347023940259, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.5598556995391846, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7269579768180847, + "num_tokens": 268779905.0, + "step": 10384 + }, + { + "epoch": 1.1404568416428729, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3880887031555176, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7194393873214722, + "num_tokens": 268805544.0, + "step": 10385 + }, + { + "epoch": 1.1405666593454864, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.3908376693725586, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7354545593261719, + "num_tokens": 268828418.0, + "step": 10386 + }, + { + "epoch": 1.1406764770481002, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.97550106048584, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7086461782455444, + "num_tokens": 268846625.0, + "step": 10387 + }, + { + "epoch": 1.140786294750714, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3790760040283203, + "learning_rate": 1e-06, + 
"loss": 0.9849, + "mean_token_accuracy": 0.7076849341392517, + "num_tokens": 268871966.0, + "step": 10388 + }, + { + "epoch": 1.1408961124533274, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.302274703979492, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7343178987503052, + "num_tokens": 268897336.0, + "step": 10389 + }, + { + "epoch": 1.1410059301559412, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3989224433898926, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6989553570747375, + "num_tokens": 268924918.0, + "step": 10390 + }, + { + "epoch": 1.1411157478585547, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3247668743133545, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.712806224822998, + "num_tokens": 268950848.0, + "step": 10391 + }, + { + "epoch": 1.1412255655611685, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.368051767349243, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7029707431793213, + "num_tokens": 268976844.0, + "step": 10392 + }, + { + "epoch": 1.141335383263782, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3145244121551514, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7176595330238342, + "num_tokens": 269004861.0, + "step": 10393 + }, + { + "epoch": 1.1414452009663958, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.6858553886413574, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7175469398498535, + "num_tokens": 269025965.0, + "step": 10394 + }, + { + "epoch": 1.1415550186690093, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.538256883621216, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7245148420333862, + "num_tokens": 269048403.0, + "step": 10395 + }, + { + "epoch": 1.141664836371623, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.343500852584839, + "learning_rate": 1e-06, + "loss": 1.0518, + 
"mean_token_accuracy": 0.6860529184341431, + "num_tokens": 269077585.0, + "step": 10396 + }, + { + "epoch": 1.1417746540742368, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2042670249938965, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7347248792648315, + "num_tokens": 269108294.0, + "step": 10397 + }, + { + "epoch": 1.1418844717768504, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.340650796890259, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.7045997381210327, + "num_tokens": 269135641.0, + "step": 10398 + }, + { + "epoch": 1.1419942894794641, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.153870105743408, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6888198852539062, + "num_tokens": 269165374.0, + "step": 10399 + }, + { + "epoch": 1.1421041071820777, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5616614818573, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7089220881462097, + "num_tokens": 269187117.0, + "step": 10400 + }, + { + "epoch": 1.1422139248846914, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 7.039425373077393, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7224323749542236, + "num_tokens": 269213528.0, + "step": 10401 + }, + { + "epoch": 1.1423237425873052, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3893773555755615, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7141232490539551, + "num_tokens": 269237503.0, + "step": 10402 + }, + { + "epoch": 1.1424335602899187, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3990318775177, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7288615703582764, + "num_tokens": 269263747.0, + "step": 10403 + }, + { + "epoch": 1.1425433779925325, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5775716304779053, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 
0.7114427089691162, + "num_tokens": 269286290.0, + "step": 10404 + }, + { + "epoch": 1.142653195695146, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5593907833099365, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7332417964935303, + "num_tokens": 269306380.0, + "step": 10405 + }, + { + "epoch": 1.1427630133977598, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5175368785858154, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7107486724853516, + "num_tokens": 269330612.0, + "step": 10406 + }, + { + "epoch": 1.1428728311003733, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4298994541168213, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7348918318748474, + "num_tokens": 269352725.0, + "step": 10407 + }, + { + "epoch": 1.142982648802987, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.0730204582214355, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7051684856414795, + "num_tokens": 269385926.0, + "step": 10408 + }, + { + "epoch": 1.1430924665056006, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3433785438537598, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6929132342338562, + "num_tokens": 269411889.0, + "step": 10409 + }, + { + "epoch": 1.1432022842082143, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5572574138641357, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6959807276725769, + "num_tokens": 269436910.0, + "step": 10410 + }, + { + "epoch": 1.143312101910828, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.314495801925659, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7034956216812134, + "num_tokens": 269464438.0, + "step": 10411 + }, + { + "epoch": 1.1434219196134416, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.326064109802246, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7040589451789856, + 
"num_tokens": 269491963.0, + "step": 10412 + }, + { + "epoch": 1.1435317373160554, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.290987014770508, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.705305278301239, + "num_tokens": 269521966.0, + "step": 10413 + }, + { + "epoch": 1.143641555018669, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.1330373287200928, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7051101326942444, + "num_tokens": 269554796.0, + "step": 10414 + }, + { + "epoch": 1.1437513727212827, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4235029220581055, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6970461010932922, + "num_tokens": 269580105.0, + "step": 10415 + }, + { + "epoch": 1.1438611904238964, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.248120069503784, + "learning_rate": 1e-06, + "loss": 1.085, + "mean_token_accuracy": 0.6813089847564697, + "num_tokens": 269608531.0, + "step": 10416 + }, + { + "epoch": 1.14397100812651, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.255852699279785, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6984730958938599, + "num_tokens": 269638760.0, + "step": 10417 + }, + { + "epoch": 1.1440808258291237, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3660058975219727, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6894205808639526, + "num_tokens": 269666361.0, + "step": 10418 + }, + { + "epoch": 1.1441906435317373, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.283224582672119, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7048313617706299, + "num_tokens": 269692763.0, + "step": 10419 + }, + { + "epoch": 1.144300461234351, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.143061637878418, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6927915215492249, + "num_tokens": 269727061.0, + 
"step": 10420 + }, + { + "epoch": 1.1444102789369646, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3994252681732178, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.735802412033081, + "num_tokens": 269752838.0, + "step": 10421 + }, + { + "epoch": 1.1445200966395783, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2310266494750977, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6992805004119873, + "num_tokens": 269783618.0, + "step": 10422 + }, + { + "epoch": 1.1446299143421919, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2208328247070312, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7077867984771729, + "num_tokens": 269813083.0, + "step": 10423 + }, + { + "epoch": 1.1447397320448056, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3740928173065186, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7130551934242249, + "num_tokens": 269841125.0, + "step": 10424 + }, + { + "epoch": 1.1448495497474194, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3235208988189697, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.734434962272644, + "num_tokens": 269865835.0, + "step": 10425 + }, + { + "epoch": 1.144959367450033, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.540369987487793, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7034163475036621, + "num_tokens": 269888757.0, + "step": 10426 + }, + { + "epoch": 1.1450691851526467, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5835843086242676, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7024154663085938, + "num_tokens": 269912248.0, + "step": 10427 + }, + { + "epoch": 1.1451790028552602, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5530173778533936, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7290433645248413, + "num_tokens": 269936794.0, + "step": 10428 + }, + { + 
"epoch": 1.145288820557874, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.356144905090332, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7108308672904968, + "num_tokens": 269962722.0, + "step": 10429 + }, + { + "epoch": 1.1453986382604877, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.589426040649414, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7144337892532349, + "num_tokens": 269986040.0, + "step": 10430 + }, + { + "epoch": 1.1455084559631012, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.1891703605651855, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7081637382507324, + "num_tokens": 270018188.0, + "step": 10431 + }, + { + "epoch": 1.145618273665715, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.309753656387329, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7237170338630676, + "num_tokens": 270045447.0, + "step": 10432 + }, + { + "epoch": 1.1457280913683285, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.417759418487549, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6978667974472046, + "num_tokens": 270073881.0, + "step": 10433 + }, + { + "epoch": 1.1458379090709423, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5457191467285156, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7167677283287048, + "num_tokens": 270097232.0, + "step": 10434 + }, + { + "epoch": 1.1459477267735558, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5108115673065186, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7022472023963928, + "num_tokens": 270122091.0, + "step": 10435 + }, + { + "epoch": 1.1460575444761696, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4020705223083496, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6914920210838318, + "num_tokens": 270147807.0, + "step": 10436 + }, + { + "epoch": 1.1461673621787831, 
+ "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.520110607147217, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7162330746650696, + "num_tokens": 270169830.0, + "step": 10437 + }, + { + "epoch": 1.1462771798813969, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.506721258163452, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7184459567070007, + "num_tokens": 270194328.0, + "step": 10438 + }, + { + "epoch": 1.1463869975840106, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3509490489959717, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7110768556594849, + "num_tokens": 270219357.0, + "step": 10439 + }, + { + "epoch": 1.1464968152866242, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.592021942138672, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7054831385612488, + "num_tokens": 270241779.0, + "step": 10440 + }, + { + "epoch": 1.146606632989238, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.7575387954711914, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.720274806022644, + "num_tokens": 270261733.0, + "step": 10441 + }, + { + "epoch": 1.1467164506918515, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.558466911315918, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7134280204772949, + "num_tokens": 270283333.0, + "step": 10442 + }, + { + "epoch": 1.1468262683944652, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.213658332824707, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7051739692687988, + "num_tokens": 270315335.0, + "step": 10443 + }, + { + "epoch": 1.1469360860970788, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.247939348220825, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7247169613838196, + "num_tokens": 270342964.0, + "step": 10444 + }, + { + "epoch": 1.1470459037996925, + "ewc_loss": 
1.6927719116210938e-05, + "grad_norm": 2.6398980617523193, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6912320256233215, + "num_tokens": 270368653.0, + "step": 10445 + }, + { + "epoch": 1.147155721502306, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.493208646774292, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7052269577980042, + "num_tokens": 270393362.0, + "step": 10446 + }, + { + "epoch": 1.1472655392049198, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4486289024353027, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7277513742446899, + "num_tokens": 270416873.0, + "step": 10447 + }, + { + "epoch": 1.1473753569075336, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.459670305252075, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7175136804580688, + "num_tokens": 270440103.0, + "step": 10448 + }, + { + "epoch": 1.147485174610147, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.601444721221924, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7154074311256409, + "num_tokens": 270462076.0, + "step": 10449 + }, + { + "epoch": 1.1475949923127609, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.326145887374878, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.696262001991272, + "num_tokens": 270488307.0, + "step": 10450 + }, + { + "epoch": 1.1477048100153744, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2796361446380615, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7150951623916626, + "num_tokens": 270517987.0, + "step": 10451 + }, + { + "epoch": 1.1478146277179881, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4626352787017822, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6996266841888428, + "num_tokens": 270543828.0, + "step": 10452 + }, + { + "epoch": 1.147924445420602, + "ewc_loss": 1.6927719116210938e-05, + 
"grad_norm": 2.4091808795928955, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6986193656921387, + "num_tokens": 270570706.0, + "step": 10453 + }, + { + "epoch": 1.1480342631232154, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.347130298614502, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7290420532226562, + "num_tokens": 270596739.0, + "step": 10454 + }, + { + "epoch": 1.1481440808258292, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.1630077362060547, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7294459342956543, + "num_tokens": 270623788.0, + "step": 10455 + }, + { + "epoch": 1.1482538985284427, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.397984027862549, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7253148555755615, + "num_tokens": 270649744.0, + "step": 10456 + }, + { + "epoch": 1.1483637162310565, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.6283726692199707, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7231137752532959, + "num_tokens": 270671889.0, + "step": 10457 + }, + { + "epoch": 1.14847353393367, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.6549720764160156, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7472441792488098, + "num_tokens": 270692169.0, + "step": 10458 + }, + { + "epoch": 1.1485833516362838, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.383836507797241, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7217174768447876, + "num_tokens": 270716963.0, + "step": 10459 + }, + { + "epoch": 1.1486931693388973, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3148741722106934, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7113499641418457, + "num_tokens": 270747232.0, + "step": 10460 + }, + { + "epoch": 1.148802987041511, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.208178997039795, 
+ "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6995396614074707, + "num_tokens": 270777490.0, + "step": 10461 + }, + { + "epoch": 1.1489128047441248, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.550992488861084, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7193026542663574, + "num_tokens": 270799558.0, + "step": 10462 + }, + { + "epoch": 1.1490226224467384, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.4127981662750244, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.7005606293678284, + "num_tokens": 270824945.0, + "step": 10463 + }, + { + "epoch": 1.1491324401493521, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.006992816925049, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6986609697341919, + "num_tokens": 270859452.0, + "step": 10464 + }, + { + "epoch": 1.1492422578519657, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2032318115234375, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7152930498123169, + "num_tokens": 270889401.0, + "step": 10465 + }, + { + "epoch": 1.1493520755545794, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.8736307621002197, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7213820219039917, + "num_tokens": 270911780.0, + "step": 10466 + }, + { + "epoch": 1.1494618932571932, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.442887306213379, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7072800397872925, + "num_tokens": 270939541.0, + "step": 10467 + }, + { + "epoch": 1.1495717109598067, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5031120777130127, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7088291645050049, + "num_tokens": 270961764.0, + "step": 10468 + }, + { + "epoch": 1.1496815286624205, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.644374132156372, + "learning_rate": 1e-06, + 
"loss": 0.9286, + "mean_token_accuracy": 0.7199836373329163, + "num_tokens": 270982189.0, + "step": 10469 + }, + { + "epoch": 1.149791346365034, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.9363484382629395, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7145909070968628, + "num_tokens": 271000362.0, + "step": 10470 + }, + { + "epoch": 1.1499011640676478, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5188074111938477, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7200196981430054, + "num_tokens": 271023383.0, + "step": 10471 + }, + { + "epoch": 1.1500109817702613, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3077094554901123, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7251861691474915, + "num_tokens": 271052828.0, + "step": 10472 + }, + { + "epoch": 1.150120799472875, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.332148313522339, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6941238641738892, + "num_tokens": 271081617.0, + "step": 10473 + }, + { + "epoch": 1.1502306171754886, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.385108232498169, + "learning_rate": 1e-06, + "loss": 1.1065, + "mean_token_accuracy": 0.6737501621246338, + "num_tokens": 271110567.0, + "step": 10474 + }, + { + "epoch": 1.1503404348781023, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.472205400466919, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.702419638633728, + "num_tokens": 271134094.0, + "step": 10475 + }, + { + "epoch": 1.150450252580716, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.434028148651123, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7313410639762878, + "num_tokens": 271159825.0, + "step": 10476 + }, + { + "epoch": 1.1505600702833296, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5230154991149902, + "learning_rate": 1e-06, + "loss": 0.9543, + 
"mean_token_accuracy": 0.7246156930923462, + "num_tokens": 271179892.0, + "step": 10477 + }, + { + "epoch": 1.1506698879859434, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.333920955657959, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7075197100639343, + "num_tokens": 271206416.0, + "step": 10478 + }, + { + "epoch": 1.150779705688557, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2141222953796387, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6850681304931641, + "num_tokens": 271235255.0, + "step": 10479 + }, + { + "epoch": 1.1508895233911707, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.053478956222534, + "learning_rate": 1e-06, + "loss": 1.1189, + "mean_token_accuracy": 0.672844409942627, + "num_tokens": 271273431.0, + "step": 10480 + }, + { + "epoch": 1.1509993410937844, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.526531457901001, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6947009563446045, + "num_tokens": 271296536.0, + "step": 10481 + }, + { + "epoch": 1.151109158796398, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.1054444313049316, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.686084508895874, + "num_tokens": 271331104.0, + "step": 10482 + }, + { + "epoch": 1.1512189764990117, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.379065752029419, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.711754560470581, + "num_tokens": 271358864.0, + "step": 10483 + }, + { + "epoch": 1.1513287942016253, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.26153826713562, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6997276544570923, + "num_tokens": 271385636.0, + "step": 10484 + }, + { + "epoch": 1.151438611904239, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.599386692047119, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 
0.7192808389663696, + "num_tokens": 271406751.0, + "step": 10485 + }, + { + "epoch": 1.1515484296068526, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.321610927581787, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6966395378112793, + "num_tokens": 271433759.0, + "step": 10486 + }, + { + "epoch": 1.1516582473094663, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 8.679656982421875, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7266825437545776, + "num_tokens": 271453057.0, + "step": 10487 + }, + { + "epoch": 1.1517680650120798, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4776298999786377, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.713485062122345, + "num_tokens": 271478297.0, + "step": 10488 + }, + { + "epoch": 1.1518778827146936, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.553834915161133, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7180391550064087, + "num_tokens": 271500563.0, + "step": 10489 + }, + { + "epoch": 1.1519877004173074, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.1343348026275635, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6956260204315186, + "num_tokens": 271533148.0, + "step": 10490 + }, + { + "epoch": 1.152097518119921, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.351691722869873, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.7058789134025574, + "num_tokens": 271561841.0, + "step": 10491 + }, + { + "epoch": 1.1522073358225347, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.600681781768799, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7257117629051208, + "num_tokens": 271583226.0, + "step": 10492 + }, + { + "epoch": 1.1523171535251482, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.5770812034606934, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7237920761108398, + 
"num_tokens": 271605953.0, + "step": 10493 + }, + { + "epoch": 1.152426971227762, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.416499614715576, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7123301029205322, + "num_tokens": 271632154.0, + "step": 10494 + }, + { + "epoch": 1.1525367889303757, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.6592907905578613, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7143276333808899, + "num_tokens": 271653841.0, + "step": 10495 + }, + { + "epoch": 1.1526466066329892, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.453930377960205, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.714081883430481, + "num_tokens": 271677977.0, + "step": 10496 + }, + { + "epoch": 1.152756424335603, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4878151416778564, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7111485600471497, + "num_tokens": 271701221.0, + "step": 10497 + }, + { + "epoch": 1.1528662420382165, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.215717315673828, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.689031720161438, + "num_tokens": 271732023.0, + "step": 10498 + }, + { + "epoch": 1.1529760597408303, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.442070484161377, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7168906927108765, + "num_tokens": 271757810.0, + "step": 10499 + }, + { + "epoch": 1.1530858774434438, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.947498321533203, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7208713293075562, + "num_tokens": 271775593.0, + "step": 10500 + }, + { + "epoch": 1.1531956951460576, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2961864471435547, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7205236554145813, + "num_tokens": 271802380.0, + 
"step": 10501 + }, + { + "epoch": 1.153305512848671, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3191416263580322, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.7059725522994995, + "num_tokens": 271829159.0, + "step": 10502 + }, + { + "epoch": 1.1534153305512849, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.6272695064544678, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7058449983596802, + "num_tokens": 271853961.0, + "step": 10503 + }, + { + "epoch": 1.1535251482538986, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2394402027130127, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7243810892105103, + "num_tokens": 271880431.0, + "step": 10504 + }, + { + "epoch": 1.1536349659565122, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.1798222064971924, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.713733434677124, + "num_tokens": 271908730.0, + "step": 10505 + }, + { + "epoch": 1.153744783659126, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.4292550086975098, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7286629676818848, + "num_tokens": 271931465.0, + "step": 10506 + }, + { + "epoch": 1.1538546013617395, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5384044647216797, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.715721845626831, + "num_tokens": 271957167.0, + "step": 10507 + }, + { + "epoch": 1.1539644190643532, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3707022666931152, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.726190984249115, + "num_tokens": 271982253.0, + "step": 10508 + }, + { + "epoch": 1.1540742367669667, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.1477150917053223, + "learning_rate": 1e-06, + "loss": 1.1251, + "mean_token_accuracy": 0.6751852035522461, + "num_tokens": 272014376.0, + "step": 10509 + }, + { + 
"epoch": 1.1541840544695805, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.265733480453491, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7129703760147095, + "num_tokens": 272043291.0, + "step": 10510 + }, + { + "epoch": 1.154293872172194, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.2782211303710938, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6914478540420532, + "num_tokens": 272071838.0, + "step": 10511 + }, + { + "epoch": 1.1544036898748078, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.27632474899292, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7042770981788635, + "num_tokens": 272099074.0, + "step": 10512 + }, + { + "epoch": 1.1545135075774215, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.6969058513641357, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7210091352462769, + "num_tokens": 272121508.0, + "step": 10513 + }, + { + "epoch": 1.154623325280035, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.447561025619507, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7267785668373108, + "num_tokens": 272145371.0, + "step": 10514 + }, + { + "epoch": 1.1547331429826488, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.4868860244750977, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.7006251811981201, + "num_tokens": 272170042.0, + "step": 10515 + }, + { + "epoch": 1.1548429606852624, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.2549190521240234, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.727786123752594, + "num_tokens": 272197860.0, + "step": 10516 + }, + { + "epoch": 1.1549527783878761, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.7269790172576904, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7203072905540466, + "num_tokens": 272219986.0, + "step": 10517 + }, + { + "epoch": 1.15506259609049, + 
"ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.4195480346679688, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.704792857170105, + "num_tokens": 272247393.0, + "step": 10518 + }, + { + "epoch": 1.1551724137931034, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.6892762184143066, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7328321933746338, + "num_tokens": 272266856.0, + "step": 10519 + }, + { + "epoch": 1.1552822314957172, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.261603593826294, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6957882642745972, + "num_tokens": 272296842.0, + "step": 10520 + }, + { + "epoch": 1.1553920491983307, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.2903616428375244, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6853206157684326, + "num_tokens": 272324349.0, + "step": 10521 + }, + { + "epoch": 1.1555018669009445, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.27077054977417, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7263398170471191, + "num_tokens": 272351169.0, + "step": 10522 + }, + { + "epoch": 1.155611684603558, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.5885958671569824, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.7007182836532593, + "num_tokens": 272375508.0, + "step": 10523 + }, + { + "epoch": 1.1557215023061718, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.079252004623413, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7113051414489746, + "num_tokens": 272405700.0, + "step": 10524 + }, + { + "epoch": 1.1558313200087853, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.24897837638855, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6839689612388611, + "num_tokens": 272435400.0, + "step": 10525 + }, + { + "epoch": 1.155941137711399, + "ewc_loss": 1.704692840576172e-05, + 
"grad_norm": 2.3978216648101807, + "learning_rate": 1e-06, + "loss": 1.1522, + "mean_token_accuracy": 0.6728302240371704, + "num_tokens": 272462424.0, + "step": 10526 + }, + { + "epoch": 1.1560509554140128, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.2649059295654297, + "learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6884176731109619, + "num_tokens": 272492794.0, + "step": 10527 + }, + { + "epoch": 1.1561607731166264, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3952248096466064, + "learning_rate": 1e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7591606974601746, + "num_tokens": 272515654.0, + "step": 10528 + }, + { + "epoch": 1.15627059081924, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.511503219604492, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7123094797134399, + "num_tokens": 272540293.0, + "step": 10529 + }, + { + "epoch": 1.1563804085218536, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.276170492172241, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7158359289169312, + "num_tokens": 272570385.0, + "step": 10530 + }, + { + "epoch": 1.1564902262244674, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.5837323665618896, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7053470015525818, + "num_tokens": 272591512.0, + "step": 10531 + }, + { + "epoch": 1.1566000439270812, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.674386978149414, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7124653458595276, + "num_tokens": 272614670.0, + "step": 10532 + }, + { + "epoch": 1.1567098616296947, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3341221809387207, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7093645334243774, + "num_tokens": 272640736.0, + "step": 10533 + }, + { + "epoch": 1.1568196793323084, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3635456562042236, + 
"learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7117136120796204, + "num_tokens": 272665179.0, + "step": 10534 + }, + { + "epoch": 1.156929497034922, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.2194511890411377, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7229578495025635, + "num_tokens": 272693745.0, + "step": 10535 + }, + { + "epoch": 1.1570393147375357, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.713343620300293, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7199315428733826, + "num_tokens": 272714068.0, + "step": 10536 + }, + { + "epoch": 1.1571491324401493, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.497575521469116, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6966793537139893, + "num_tokens": 272737718.0, + "step": 10537 + }, + { + "epoch": 1.157258950142763, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.5685384273529053, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7181837558746338, + "num_tokens": 272760243.0, + "step": 10538 + }, + { + "epoch": 1.1573687678453766, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.2191147804260254, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.702184796333313, + "num_tokens": 272788577.0, + "step": 10539 + }, + { + "epoch": 1.1574785855479903, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3590450286865234, + "learning_rate": 1e-06, + "loss": 1.0694, + "mean_token_accuracy": 0.6842918395996094, + "num_tokens": 272816076.0, + "step": 10540 + }, + { + "epoch": 1.157588403250604, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.647639274597168, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7132712602615356, + "num_tokens": 272838322.0, + "step": 10541 + }, + { + "epoch": 1.1576982209532176, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4518380165100098, + "learning_rate": 1e-06, + "loss": 
0.8806, + "mean_token_accuracy": 0.7308915853500366, + "num_tokens": 272864169.0, + "step": 10542 + }, + { + "epoch": 1.1578080386558314, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4584157466888428, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7192010283470154, + "num_tokens": 272888136.0, + "step": 10543 + }, + { + "epoch": 1.157917856358445, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3869268894195557, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.699566662311554, + "num_tokens": 272912632.0, + "step": 10544 + }, + { + "epoch": 1.1580276740610587, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3715357780456543, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7054761648178101, + "num_tokens": 272939621.0, + "step": 10545 + }, + { + "epoch": 1.1581374917636724, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.5191609859466553, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.7001504898071289, + "num_tokens": 272963177.0, + "step": 10546 + }, + { + "epoch": 1.158247309466286, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4527409076690674, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7285359501838684, + "num_tokens": 272987430.0, + "step": 10547 + }, + { + "epoch": 1.1583571271688997, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.6799731254577637, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7230991125106812, + "num_tokens": 273008216.0, + "step": 10548 + }, + { + "epoch": 1.1584669448715132, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.5851035118103027, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.7019754648208618, + "num_tokens": 273031222.0, + "step": 10549 + }, + { + "epoch": 1.158576762574127, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.5485076904296875, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 
0.7221328020095825, + "num_tokens": 273053142.0, + "step": 10550 + }, + { + "epoch": 1.1586865802767405, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.379100799560547, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6965602040290833, + "num_tokens": 273080271.0, + "step": 10551 + }, + { + "epoch": 1.1587963979793543, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4001004695892334, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7172117829322815, + "num_tokens": 273103267.0, + "step": 10552 + }, + { + "epoch": 1.1589062156819678, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3496932983398438, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7073113918304443, + "num_tokens": 273129129.0, + "step": 10553 + }, + { + "epoch": 1.1590160333845816, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.162412405014038, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7233160138130188, + "num_tokens": 273159544.0, + "step": 10554 + }, + { + "epoch": 1.1591258510871953, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2660574913024902, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7139812707901001, + "num_tokens": 273186077.0, + "step": 10555 + }, + { + "epoch": 1.1592356687898089, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.3173036575317383, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7113693356513977, + "num_tokens": 273213808.0, + "step": 10556 + }, + { + "epoch": 1.1593454864924226, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.321248769760132, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7198790907859802, + "num_tokens": 273242014.0, + "step": 10557 + }, + { + "epoch": 1.1594553041950362, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.1435723304748535, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7021780014038086, + 
"num_tokens": 273272011.0, + "step": 10558 + }, + { + "epoch": 1.15956512189765, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4624407291412354, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6884206533432007, + "num_tokens": 273296193.0, + "step": 10559 + }, + { + "epoch": 1.1596749396002635, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2749669551849365, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7114081382751465, + "num_tokens": 273324154.0, + "step": 10560 + }, + { + "epoch": 1.1597847573028772, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.172896146774292, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6951115131378174, + "num_tokens": 273356852.0, + "step": 10561 + }, + { + "epoch": 1.159894575005491, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3397724628448486, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6944112777709961, + "num_tokens": 273386090.0, + "step": 10562 + }, + { + "epoch": 1.1600043927081045, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.207109212875366, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7075166702270508, + "num_tokens": 273416319.0, + "step": 10563 + }, + { + "epoch": 1.1601142104107183, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.544541597366333, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.7002770900726318, + "num_tokens": 273439730.0, + "step": 10564 + }, + { + "epoch": 1.1602240281133318, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.460261344909668, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6922942399978638, + "num_tokens": 273465595.0, + "step": 10565 + }, + { + "epoch": 1.1603338458159456, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.282489061355591, + "learning_rate": 1e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.676313579082489, + "num_tokens": 273496619.0, + 
"step": 10566 + }, + { + "epoch": 1.160443663518559, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.751429319381714, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.725059449672699, + "num_tokens": 273517232.0, + "step": 10567 + }, + { + "epoch": 1.1605534812211729, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.463869333267212, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.7039825916290283, + "num_tokens": 273543123.0, + "step": 10568 + }, + { + "epoch": 1.1606632989237866, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.1875617504119873, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.709025502204895, + "num_tokens": 273573824.0, + "step": 10569 + }, + { + "epoch": 1.1607731166264001, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.5617377758026123, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7246389389038086, + "num_tokens": 273596774.0, + "step": 10570 + }, + { + "epoch": 1.160882934329014, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.1552670001983643, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7124637365341187, + "num_tokens": 273625480.0, + "step": 10571 + }, + { + "epoch": 1.1609927520316274, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.3454463481903076, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7131436467170715, + "num_tokens": 273650325.0, + "step": 10572 + }, + { + "epoch": 1.1611025697342412, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.4694411754608154, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6903319358825684, + "num_tokens": 273673384.0, + "step": 10573 + }, + { + "epoch": 1.1612123874368547, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.498563051223755, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.7112919092178345, + "num_tokens": 273696548.0, + "step": 10574 + }, + { + "epoch": 
1.1613222051394685, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.225520610809326, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.70500248670578, + "num_tokens": 273725958.0, + "step": 10575 + }, + { + "epoch": 1.161432022842082, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.150716781616211, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6990739107131958, + "num_tokens": 273758568.0, + "step": 10576 + }, + { + "epoch": 1.1615418405446958, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.291606903076172, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7442140579223633, + "num_tokens": 273786322.0, + "step": 10577 + }, + { + "epoch": 1.1616516582473095, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3225581645965576, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7116564512252808, + "num_tokens": 273812749.0, + "step": 10578 + }, + { + "epoch": 1.161761475949923, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3665425777435303, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7212544679641724, + "num_tokens": 273836812.0, + "step": 10579 + }, + { + "epoch": 1.1618712936525368, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4397330284118652, + "learning_rate": 1e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6828023195266724, + "num_tokens": 273861639.0, + "step": 10580 + }, + { + "epoch": 1.1619811113551504, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 2.241720676422119, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7098890542984009, + "num_tokens": 273890785.0, + "step": 10581 + }, + { + "epoch": 1.1620909290577641, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2309675216674805, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7033935785293579, + "num_tokens": 273920468.0, + "step": 10582 + }, + { + "epoch": 1.1622007467603779, + "ewc_loss": 
1.71661376953125e-05, + "grad_norm": 2.3349449634552, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7329609394073486, + "num_tokens": 273946197.0, + "step": 10583 + }, + { + "epoch": 1.1623105644629914, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2800021171569824, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7045124769210815, + "num_tokens": 273974399.0, + "step": 10584 + }, + { + "epoch": 1.1624203821656052, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3434293270111084, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7177761793136597, + "num_tokens": 274001424.0, + "step": 10585 + }, + { + "epoch": 1.1625301998682187, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6357126235961914, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6962298154830933, + "num_tokens": 274024276.0, + "step": 10586 + }, + { + "epoch": 1.1626400175708325, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4881210327148438, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6995705366134644, + "num_tokens": 274048851.0, + "step": 10587 + }, + { + "epoch": 1.162749835273446, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.5350594520568848, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7084877490997314, + "num_tokens": 274071445.0, + "step": 10588 + }, + { + "epoch": 1.1628596529760598, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2400944232940674, + "learning_rate": 1e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7528355121612549, + "num_tokens": 274098455.0, + "step": 10589 + }, + { + "epoch": 1.1629694706786733, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.373228073120117, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.734328031539917, + "num_tokens": 274124178.0, + "step": 10590 + }, + { + "epoch": 1.163079288381287, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 
2.372657299041748, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7104774713516235, + "num_tokens": 274151069.0, + "step": 10591 + }, + { + "epoch": 1.1631891060839008, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.524066209793091, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.7007169723510742, + "num_tokens": 274173561.0, + "step": 10592 + }, + { + "epoch": 1.1632989237865143, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.1558873653411865, + "learning_rate": 1e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6767369508743286, + "num_tokens": 274205144.0, + "step": 10593 + }, + { + "epoch": 1.163408741489128, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4876222610473633, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6918529272079468, + "num_tokens": 274231929.0, + "step": 10594 + }, + { + "epoch": 1.1635185591917416, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.467417001724243, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7292279005050659, + "num_tokens": 274256158.0, + "step": 10595 + }, + { + "epoch": 1.1636283768943554, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.455652952194214, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6925179958343506, + "num_tokens": 274283026.0, + "step": 10596 + }, + { + "epoch": 1.1637381945969691, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.395005702972412, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6928368806838989, + "num_tokens": 274309775.0, + "step": 10597 + }, + { + "epoch": 1.1638480122995827, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.475062847137451, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.711124062538147, + "num_tokens": 274334540.0, + "step": 10598 + }, + { + "epoch": 1.1639578300021964, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.583925485610962, + "learning_rate": 1e-06, 
+ "loss": 1.0184, + "mean_token_accuracy": 0.7092914581298828, + "num_tokens": 274357535.0, + "step": 10599 + }, + { + "epoch": 1.16406764770481, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.380878210067749, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7121273279190063, + "num_tokens": 274382477.0, + "step": 10600 + }, + { + "epoch": 1.1641774654074237, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.383770227432251, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7216728925704956, + "num_tokens": 274408585.0, + "step": 10601 + }, + { + "epoch": 1.1642872831100373, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.188668966293335, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7410962581634521, + "num_tokens": 274437220.0, + "step": 10602 + }, + { + "epoch": 1.164397100812651, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.215442180633545, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7092699408531189, + "num_tokens": 274469802.0, + "step": 10603 + }, + { + "epoch": 1.1645069185152646, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2325103282928467, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7062914371490479, + "num_tokens": 274499495.0, + "step": 10604 + }, + { + "epoch": 1.1646167362178783, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.4316213130950928, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7223609685897827, + "num_tokens": 274522048.0, + "step": 10605 + }, + { + "epoch": 1.164726553920492, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.38999342918396, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6900434494018555, + "num_tokens": 274550397.0, + "step": 10606 + }, + { + "epoch": 1.1648363716231056, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4493894577026367, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 
0.7177116274833679, + "num_tokens": 274572289.0, + "step": 10607 + }, + { + "epoch": 1.1649461893257194, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.409939765930176, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7004584670066833, + "num_tokens": 274598271.0, + "step": 10608 + }, + { + "epoch": 1.165056007028333, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.16377592086792, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7123919725418091, + "num_tokens": 274626039.0, + "step": 10609 + }, + { + "epoch": 1.1651658247309467, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.27970814704895, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6973122358322144, + "num_tokens": 274657055.0, + "step": 10610 + }, + { + "epoch": 1.1652756424335604, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2982866764068604, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.716044545173645, + "num_tokens": 274685455.0, + "step": 10611 + }, + { + "epoch": 1.165385460136174, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.494310140609741, + "learning_rate": 1e-06, + "loss": 1.0997, + "mean_token_accuracy": 0.6882688403129578, + "num_tokens": 274709761.0, + "step": 10612 + }, + { + "epoch": 1.1654952778387877, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4531383514404297, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7155924439430237, + "num_tokens": 274734396.0, + "step": 10613 + }, + { + "epoch": 1.1656050955414012, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.117814540863037, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7050936222076416, + "num_tokens": 274768069.0, + "step": 10614 + }, + { + "epoch": 1.165714913244015, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.104008436203003, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7162692546844482, + "num_tokens": 274800029.0, + 
"step": 10615 + }, + { + "epoch": 1.1658247309466285, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.8818681240081787, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7197808623313904, + "num_tokens": 274819886.0, + "step": 10616 + }, + { + "epoch": 1.1659345486492423, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3319473266601562, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7008653283119202, + "num_tokens": 274848395.0, + "step": 10617 + }, + { + "epoch": 1.1660443663518558, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3248496055603027, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7114925384521484, + "num_tokens": 274872749.0, + "step": 10618 + }, + { + "epoch": 1.1661541840544696, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.296029567718506, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7226412296295166, + "num_tokens": 274897661.0, + "step": 10619 + }, + { + "epoch": 1.1662640017570833, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.364732027053833, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.697953999042511, + "num_tokens": 274922900.0, + "step": 10620 + }, + { + "epoch": 1.1663738194596969, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.141660451889038, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.71126788854599, + "num_tokens": 274953199.0, + "step": 10621 + }, + { + "epoch": 1.1664836371623106, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6326279640197754, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7190810441970825, + "num_tokens": 274975312.0, + "step": 10622 + }, + { + "epoch": 1.1665934548649242, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.248417377471924, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6893190145492554, + "num_tokens": 275005756.0, + "step": 10623 + }, + { + "epoch": 
1.166703272567538, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.5098702907562256, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7199324369430542, + "num_tokens": 275029331.0, + "step": 10624 + }, + { + "epoch": 1.1668130902701515, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6668715476989746, + "learning_rate": 1e-06, + "loss": 1.1344, + "mean_token_accuracy": 0.6717652082443237, + "num_tokens": 275054582.0, + "step": 10625 + }, + { + "epoch": 1.1669229079727652, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4329538345336914, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.73293137550354, + "num_tokens": 275079544.0, + "step": 10626 + }, + { + "epoch": 1.167032725675379, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3680458068847656, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.7009903788566589, + "num_tokens": 275109495.0, + "step": 10627 + }, + { + "epoch": 1.1671425433779925, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.475661039352417, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7084027528762817, + "num_tokens": 275132812.0, + "step": 10628 + }, + { + "epoch": 1.1672523610806063, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.266225814819336, + "learning_rate": 1e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.6870317459106445, + "num_tokens": 275161256.0, + "step": 10629 + }, + { + "epoch": 1.1673621787832198, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.364851474761963, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.720727801322937, + "num_tokens": 275186318.0, + "step": 10630 + }, + { + "epoch": 1.1674719964858336, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6174378395080566, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.690049946308136, + "num_tokens": 275207258.0, + "step": 10631 + }, + { + "epoch": 1.167581814188447, + "ewc_loss": 
1.71661376953125e-05, + "grad_norm": 2.435396671295166, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7093227505683899, + "num_tokens": 275233345.0, + "step": 10632 + }, + { + "epoch": 1.1676916318910608, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4478657245635986, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7240182757377625, + "num_tokens": 275258457.0, + "step": 10633 + }, + { + "epoch": 1.1678014495936746, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.7550878524780273, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7246242761611938, + "num_tokens": 275278010.0, + "step": 10634 + }, + { + "epoch": 1.1679112672962881, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.465886354446411, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7164486646652222, + "num_tokens": 275304279.0, + "step": 10635 + }, + { + "epoch": 1.168021084998902, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6663408279418945, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7090965509414673, + "num_tokens": 275326567.0, + "step": 10636 + }, + { + "epoch": 1.1681309027015154, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.0371687412261963, + "learning_rate": 1e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.6817111372947693, + "num_tokens": 275363466.0, + "step": 10637 + }, + { + "epoch": 1.1682407204041292, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 4.0340495109558105, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7259582877159119, + "num_tokens": 275383405.0, + "step": 10638 + }, + { + "epoch": 1.1683505381067427, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2132668495178223, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7207942008972168, + "num_tokens": 275410995.0, + "step": 10639 + }, + { + "epoch": 1.1684603558093565, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 
2.171412467956543, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6878400444984436, + "num_tokens": 275442348.0, + "step": 10640 + }, + { + "epoch": 1.16857017351197, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.156867265701294, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.699874997138977, + "num_tokens": 275473597.0, + "step": 10641 + }, + { + "epoch": 1.1686799912145838, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.1778664588928223, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6921695470809937, + "num_tokens": 275505412.0, + "step": 10642 + }, + { + "epoch": 1.1687898089171975, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.5804104804992676, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7066321969032288, + "num_tokens": 275528121.0, + "step": 10643 + }, + { + "epoch": 1.168899626619811, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4356470108032227, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7317553758621216, + "num_tokens": 275549898.0, + "step": 10644 + }, + { + "epoch": 1.1690094443224248, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2168381214141846, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6854325532913208, + "num_tokens": 275579992.0, + "step": 10645 + }, + { + "epoch": 1.1691192620250384, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.587529182434082, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7036175727844238, + "num_tokens": 275602864.0, + "step": 10646 + }, + { + "epoch": 1.169229079727652, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.1721699237823486, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7229743599891663, + "num_tokens": 275631977.0, + "step": 10647 + }, + { + "epoch": 1.1693388974302659, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.295100450515747, + "learning_rate": 1e-06, + 
"loss": 0.9548, + "mean_token_accuracy": 0.7153003215789795, + "num_tokens": 275661055.0, + "step": 10648 + }, + { + "epoch": 1.1694487151328794, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3334479331970215, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7250564098358154, + "num_tokens": 275686776.0, + "step": 10649 + }, + { + "epoch": 1.1695585328354932, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.204855442047119, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7361972332000732, + "num_tokens": 275715084.0, + "step": 10650 + }, + { + "epoch": 1.1696683505381067, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.129818916320801, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6874611377716064, + "num_tokens": 275748446.0, + "step": 10651 + }, + { + "epoch": 1.1697781682407205, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.652728796005249, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6985338926315308, + "num_tokens": 275770040.0, + "step": 10652 + }, + { + "epoch": 1.169887985943334, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.5952770709991455, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7137036919593811, + "num_tokens": 275793328.0, + "step": 10653 + }, + { + "epoch": 1.1699978036459477, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.236912727355957, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7024938464164734, + "num_tokens": 275823134.0, + "step": 10654 + }, + { + "epoch": 1.1701076213485613, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 3.8528947830200195, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6962283849716187, + "num_tokens": 275851797.0, + "step": 10655 + }, + { + "epoch": 1.170217439051175, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.5470876693725586, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 
0.7091948986053467, + "num_tokens": 275874996.0, + "step": 10656 + }, + { + "epoch": 1.1703272567537888, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3189003467559814, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7035925388336182, + "num_tokens": 275901105.0, + "step": 10657 + }, + { + "epoch": 1.1704370744564023, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3481929302215576, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7063571214675903, + "num_tokens": 275928391.0, + "step": 10658 + }, + { + "epoch": 1.170546892159016, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3839471340179443, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7133300304412842, + "num_tokens": 275955151.0, + "step": 10659 + }, + { + "epoch": 1.1706567098616296, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.5725324153900146, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7097876667976379, + "num_tokens": 275977107.0, + "step": 10660 + }, + { + "epoch": 1.1707665275642434, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2417378425598145, + "learning_rate": 1e-06, + "loss": 1.0911, + "mean_token_accuracy": 0.679999589920044, + "num_tokens": 276006547.0, + "step": 10661 + }, + { + "epoch": 1.1708763452668571, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.186279058456421, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.709804117679596, + "num_tokens": 276034604.0, + "step": 10662 + }, + { + "epoch": 1.1709861629694707, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3825507164001465, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6946325302124023, + "num_tokens": 276062945.0, + "step": 10663 + }, + { + "epoch": 1.1710959806720844, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.778672218322754, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7371094226837158, + "num_tokens": 
276081219.0, + "step": 10664 + }, + { + "epoch": 1.171205798374698, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.470309257507324, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6961351037025452, + "num_tokens": 276105780.0, + "step": 10665 + }, + { + "epoch": 1.1713156160773117, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.464888095855713, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7266726493835449, + "num_tokens": 276127673.0, + "step": 10666 + }, + { + "epoch": 1.1714254337799253, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.238710641860962, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7094505429267883, + "num_tokens": 276155202.0, + "step": 10667 + }, + { + "epoch": 1.171535251482539, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.382789373397827, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7009817957878113, + "num_tokens": 276182164.0, + "step": 10668 + }, + { + "epoch": 1.1716450691851525, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.135676860809326, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7252630591392517, + "num_tokens": 276210888.0, + "step": 10669 + }, + { + "epoch": 1.1717548868877663, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.206378698348999, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7189183235168457, + "num_tokens": 276238745.0, + "step": 10670 + }, + { + "epoch": 1.17186470459038, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4774880409240723, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.717888593673706, + "num_tokens": 276262017.0, + "step": 10671 + }, + { + "epoch": 1.1719745222929936, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2589619159698486, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.702778697013855, + "num_tokens": 276291389.0, + "step": 10672 + }, + { + "epoch": 
1.1720843399956073, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.548859119415283, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7169098854064941, + "num_tokens": 276314507.0, + "step": 10673 + }, + { + "epoch": 1.1721941576982209, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.267045259475708, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7099413871765137, + "num_tokens": 276341334.0, + "step": 10674 + }, + { + "epoch": 1.1723039754008346, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3009016513824463, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7046048641204834, + "num_tokens": 276366493.0, + "step": 10675 + }, + { + "epoch": 1.1724137931034484, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.575862169265747, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7124549150466919, + "num_tokens": 276388674.0, + "step": 10676 + }, + { + "epoch": 1.172523610806062, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3889763355255127, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.7019058465957642, + "num_tokens": 276412824.0, + "step": 10677 + }, + { + "epoch": 1.1726334285086757, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.406829833984375, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7165175080299377, + "num_tokens": 276440248.0, + "step": 10678 + }, + { + "epoch": 1.1727432462112892, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3267288208007812, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7162032127380371, + "num_tokens": 276466961.0, + "step": 10679 + }, + { + "epoch": 1.172853063913903, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6062562465667725, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7367432713508606, + "num_tokens": 276488568.0, + "step": 10680 + }, + { + "epoch": 1.1729628816165165, + "ewc_loss": 
1.71661376953125e-05, + "grad_norm": 2.35760498046875, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7160379886627197, + "num_tokens": 276514628.0, + "step": 10681 + }, + { + "epoch": 1.1730726993191303, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.118055820465088, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7074428796768188, + "num_tokens": 276546001.0, + "step": 10682 + }, + { + "epoch": 1.1731825170217438, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4804506301879883, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.717876672744751, + "num_tokens": 276567144.0, + "step": 10683 + }, + { + "epoch": 1.1732923347243576, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.088566541671753, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6972543001174927, + "num_tokens": 276600189.0, + "step": 10684 + }, + { + "epoch": 1.1734021524269713, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4220240116119385, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7131797671318054, + "num_tokens": 276626814.0, + "step": 10685 + }, + { + "epoch": 1.1735119701295849, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2159574031829834, + "learning_rate": 1e-06, + "loss": 1.1145, + "mean_token_accuracy": 0.6762157678604126, + "num_tokens": 276660157.0, + "step": 10686 + }, + { + "epoch": 1.1736217878321986, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.431081533432007, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7021385431289673, + "num_tokens": 276687209.0, + "step": 10687 + }, + { + "epoch": 1.1737316055348122, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.056858777999878, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7160956859588623, + "num_tokens": 276719689.0, + "step": 10688 + }, + { + "epoch": 1.173841423237426, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 
2.1332755088806152, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7145876288414001, + "num_tokens": 276750718.0, + "step": 10689 + }, + { + "epoch": 1.1739512409400394, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.494434356689453, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7066813707351685, + "num_tokens": 276775344.0, + "step": 10690 + }, + { + "epoch": 1.1740610586426532, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4453375339508057, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6851398944854736, + "num_tokens": 276802480.0, + "step": 10691 + }, + { + "epoch": 1.1741708763452667, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.5111992359161377, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7374362349510193, + "num_tokens": 276825093.0, + "step": 10692 + }, + { + "epoch": 1.1742806940478805, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.282620429992676, + "learning_rate": 1e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.6819757223129272, + "num_tokens": 276859244.0, + "step": 10693 + }, + { + "epoch": 1.1743905117504942, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.308170795440674, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7134590148925781, + "num_tokens": 276883396.0, + "step": 10694 + }, + { + "epoch": 1.1745003294531078, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3160133361816406, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7248755693435669, + "num_tokens": 276910340.0, + "step": 10695 + }, + { + "epoch": 1.1746101471557215, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6034748554229736, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7192351818084717, + "num_tokens": 276935607.0, + "step": 10696 + }, + { + "epoch": 1.174719964858335, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6185150146484375, + "learning_rate": 
1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7079007625579834, + "num_tokens": 276958407.0, + "step": 10697 + }, + { + "epoch": 1.1748297825609488, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.339094400405884, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7049061059951782, + "num_tokens": 276984274.0, + "step": 10698 + }, + { + "epoch": 1.1749396002635626, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.64978289604187, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7324874401092529, + "num_tokens": 277003924.0, + "step": 10699 + }, + { + "epoch": 1.1750494179661761, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4234824180603027, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7010326385498047, + "num_tokens": 277029326.0, + "step": 10700 + }, + { + "epoch": 1.1751592356687899, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4395751953125, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7404006719589233, + "num_tokens": 277052982.0, + "step": 10701 + }, + { + "epoch": 1.1752690533714034, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.239015579223633, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7149950265884399, + "num_tokens": 277082856.0, + "step": 10702 + }, + { + "epoch": 1.1753788710740172, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.332491874694824, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6872957348823547, + "num_tokens": 277110580.0, + "step": 10703 + }, + { + "epoch": 1.1754886887766307, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.53497052192688, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.713335394859314, + "num_tokens": 277133087.0, + "step": 10704 + }, + { + "epoch": 1.1755985064792445, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.435781240463257, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 
0.710148811340332, + "num_tokens": 277157403.0, + "step": 10705 + }, + { + "epoch": 1.175708324181858, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.334528923034668, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.7051492929458618, + "num_tokens": 277183739.0, + "step": 10706 + }, + { + "epoch": 1.1758181418844718, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2905113697052, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6992783546447754, + "num_tokens": 277210821.0, + "step": 10707 + }, + { + "epoch": 1.1759279595870855, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2537596225738525, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7220334410667419, + "num_tokens": 277236725.0, + "step": 10708 + }, + { + "epoch": 1.176037777289699, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.222651720046997, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.703150749206543, + "num_tokens": 277268483.0, + "step": 10709 + }, + { + "epoch": 1.1761475949923128, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.291184902191162, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7195581793785095, + "num_tokens": 277296117.0, + "step": 10710 + }, + { + "epoch": 1.1762574126949263, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.556955575942993, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7141338586807251, + "num_tokens": 277318543.0, + "step": 10711 + }, + { + "epoch": 1.17636723039754, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.240267038345337, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7026659250259399, + "num_tokens": 277347866.0, + "step": 10712 + }, + { + "epoch": 1.1764770481001539, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2897539138793945, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6868360042572021, + "num_tokens": 277376341.0, 
+ "step": 10713 + }, + { + "epoch": 1.1765868658027674, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.251157522201538, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7231354713439941, + "num_tokens": 277403427.0, + "step": 10714 + }, + { + "epoch": 1.1766966835053811, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.2335596084594727, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.717432975769043, + "num_tokens": 277430711.0, + "step": 10715 + }, + { + "epoch": 1.1768065012079947, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.1717796325683594, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6894789934158325, + "num_tokens": 277460092.0, + "step": 10716 + }, + { + "epoch": 1.1769163189106084, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.255819797515869, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6922136545181274, + "num_tokens": 277486876.0, + "step": 10717 + }, + { + "epoch": 1.177026136613222, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.4850423336029053, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.704344630241394, + "num_tokens": 277510917.0, + "step": 10718 + }, + { + "epoch": 1.1771359543158357, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.310051441192627, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6998170614242554, + "num_tokens": 277537077.0, + "step": 10719 + }, + { + "epoch": 1.1772457720184493, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.6042981147766113, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7125986814498901, + "num_tokens": 277559338.0, + "step": 10720 + }, + { + "epoch": 1.177355589721063, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.793517827987671, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7271636724472046, + "num_tokens": 277580893.0, + "step": 10721 + }, + { + "epoch": 
1.1774654074236768, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.648022174835205, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7022017240524292, + "num_tokens": 277602487.0, + "step": 10722 + }, + { + "epoch": 1.1775752251262903, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.3122026920318604, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.70267653465271, + "num_tokens": 277633982.0, + "step": 10723 + }, + { + "epoch": 1.177685042828904, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.6567256450653076, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7190026044845581, + "num_tokens": 277656418.0, + "step": 10724 + }, + { + "epoch": 1.1777948605315176, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 2.5094542503356934, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6950736045837402, + "num_tokens": 277681706.0, + "step": 10725 + }, + { + "epoch": 1.1779046782341314, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.542343854904175, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.722320556640625, + "num_tokens": 277704022.0, + "step": 10726 + }, + { + "epoch": 1.1780144959367451, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.78528094291687, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7411059141159058, + "num_tokens": 277722483.0, + "step": 10727 + }, + { + "epoch": 1.1781243136393587, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.6060149669647217, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7147260904312134, + "num_tokens": 277744425.0, + "step": 10728 + }, + { + "epoch": 1.1782341313419724, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.8793790340423584, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7207717299461365, + "num_tokens": 277762468.0, + "step": 10729 + }, + { + "epoch": 1.178343949044586, + "ewc_loss": 
1.728534698486328e-05, + "grad_norm": 2.4318909645080566, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6988489627838135, + "num_tokens": 277789603.0, + "step": 10730 + }, + { + "epoch": 1.1784537667471997, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3459088802337646, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6906669735908508, + "num_tokens": 277816454.0, + "step": 10731 + }, + { + "epoch": 1.1785635844498132, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.285684823989868, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7285230755805969, + "num_tokens": 277844616.0, + "step": 10732 + }, + { + "epoch": 1.178673402152427, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.439143180847168, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.701278030872345, + "num_tokens": 277870836.0, + "step": 10733 + }, + { + "epoch": 1.1787832198550405, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.45550537109375, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7185250520706177, + "num_tokens": 277894579.0, + "step": 10734 + }, + { + "epoch": 1.1788930375576543, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2684662342071533, + "learning_rate": 1e-06, + "loss": 1.1006, + "mean_token_accuracy": 0.6796606779098511, + "num_tokens": 277924162.0, + "step": 10735 + }, + { + "epoch": 1.179002855260268, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.191659688949585, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7114126682281494, + "num_tokens": 277952941.0, + "step": 10736 + }, + { + "epoch": 1.1791126729628816, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.449272394180298, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6964288949966431, + "num_tokens": 277977943.0, + "step": 10737 + }, + { + "epoch": 1.1792224906654953, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 
2.5097880363464355, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6992955207824707, + "num_tokens": 278000702.0, + "step": 10738 + }, + { + "epoch": 1.1793323083681089, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.6005399227142334, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7128656506538391, + "num_tokens": 278023195.0, + "step": 10739 + }, + { + "epoch": 1.1794421260707226, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3248984813690186, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7187593579292297, + "num_tokens": 278051203.0, + "step": 10740 + }, + { + "epoch": 1.1795519437733364, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.332807779312134, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.715053141117096, + "num_tokens": 278076421.0, + "step": 10741 + }, + { + "epoch": 1.17966176147595, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.513305187225342, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7022432088851929, + "num_tokens": 278101100.0, + "step": 10742 + }, + { + "epoch": 1.1797715791785637, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.4776506423950195, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6954835653305054, + "num_tokens": 278126360.0, + "step": 10743 + }, + { + "epoch": 1.1798813968811772, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2521018981933594, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7147994041442871, + "num_tokens": 278152646.0, + "step": 10744 + }, + { + "epoch": 1.179991214583791, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5804178714752197, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7007360458374023, + "num_tokens": 278174769.0, + "step": 10745 + }, + { + "epoch": 1.1801010322864045, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.234525680541992, + "learning_rate": 
1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7163031101226807, + "num_tokens": 278203501.0, + "step": 10746 + }, + { + "epoch": 1.1802108499890183, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3281891345977783, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.723564624786377, + "num_tokens": 278230519.0, + "step": 10747 + }, + { + "epoch": 1.1803206676916318, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.353139877319336, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7226619720458984, + "num_tokens": 278256524.0, + "step": 10748 + }, + { + "epoch": 1.1804304853942456, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5841476917266846, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7259548902511597, + "num_tokens": 278278805.0, + "step": 10749 + }, + { + "epoch": 1.1805403030968593, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.286686897277832, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7077839970588684, + "num_tokens": 278307389.0, + "step": 10750 + }, + { + "epoch": 1.1806501207994728, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.4056198596954346, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6997430324554443, + "num_tokens": 278332455.0, + "step": 10751 + }, + { + "epoch": 1.1807599385020866, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 3.7330312728881836, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7069542407989502, + "num_tokens": 278362515.0, + "step": 10752 + }, + { + "epoch": 1.1808697562047001, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.439507484436035, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7289712429046631, + "num_tokens": 278388616.0, + "step": 10753 + }, + { + "epoch": 1.180979573907314, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.6226084232330322, + "learning_rate": 1e-06, + "loss": 1.0213, + 
"mean_token_accuracy": 0.7014003992080688, + "num_tokens": 278414433.0, + "step": 10754 + }, + { + "epoch": 1.1810893916099274, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5706655979156494, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7229489088058472, + "num_tokens": 278435896.0, + "step": 10755 + }, + { + "epoch": 1.1811992093125412, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5254709720611572, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7325583696365356, + "num_tokens": 278457544.0, + "step": 10756 + }, + { + "epoch": 1.1813090270151547, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3383946418762207, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.695127546787262, + "num_tokens": 278483248.0, + "step": 10757 + }, + { + "epoch": 1.1814188447177685, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.501284599304199, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.7036648988723755, + "num_tokens": 278506615.0, + "step": 10758 + }, + { + "epoch": 1.1815286624203822, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3662285804748535, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7049776315689087, + "num_tokens": 278533229.0, + "step": 10759 + }, + { + "epoch": 1.1816384801229958, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.297065496444702, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7176706790924072, + "num_tokens": 278559948.0, + "step": 10760 + }, + { + "epoch": 1.1817482978256095, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.4479475021362305, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.709444522857666, + "num_tokens": 278583103.0, + "step": 10761 + }, + { + "epoch": 1.181858115528223, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.469334363937378, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 
0.7161530256271362, + "num_tokens": 278610819.0, + "step": 10762 + }, + { + "epoch": 1.1819679332308368, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.259371519088745, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7348529100418091, + "num_tokens": 278639297.0, + "step": 10763 + }, + { + "epoch": 1.1820777509334506, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2824833393096924, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6954660415649414, + "num_tokens": 278668029.0, + "step": 10764 + }, + { + "epoch": 1.1821875686360641, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 3.1883575916290283, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7106121778488159, + "num_tokens": 278684837.0, + "step": 10765 + }, + { + "epoch": 1.1822973863386779, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.313830614089966, + "learning_rate": 1e-06, + "loss": 1.1312, + "mean_token_accuracy": 0.6711750626564026, + "num_tokens": 278716488.0, + "step": 10766 + }, + { + "epoch": 1.1824072040412914, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.64456844329834, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7054550647735596, + "num_tokens": 278739057.0, + "step": 10767 + }, + { + "epoch": 1.1825170217439052, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.7672460079193115, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7233625650405884, + "num_tokens": 278757143.0, + "step": 10768 + }, + { + "epoch": 1.1826268394465187, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.17329478263855, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7158973813056946, + "num_tokens": 278788023.0, + "step": 10769 + }, + { + "epoch": 1.1827366571491325, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.473320245742798, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.71209716796875, + "num_tokens": 
278811875.0, + "step": 10770 + }, + { + "epoch": 1.182846474851746, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.500751256942749, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.698686957359314, + "num_tokens": 278837201.0, + "step": 10771 + }, + { + "epoch": 1.1829562925543597, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.763016700744629, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7292816638946533, + "num_tokens": 278856995.0, + "step": 10772 + }, + { + "epoch": 1.1830661102569735, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5908851623535156, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7065321207046509, + "num_tokens": 278879410.0, + "step": 10773 + }, + { + "epoch": 1.183175927959587, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5408875942230225, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7021938562393188, + "num_tokens": 278902806.0, + "step": 10774 + }, + { + "epoch": 1.1832857456622008, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.47796893119812, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6970699429512024, + "num_tokens": 278928228.0, + "step": 10775 + }, + { + "epoch": 1.1833955633648143, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.329220771789551, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6903386116027832, + "num_tokens": 278956107.0, + "step": 10776 + }, + { + "epoch": 1.183505381067428, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.355435848236084, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7306886315345764, + "num_tokens": 278980976.0, + "step": 10777 + }, + { + "epoch": 1.1836151987700418, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5037503242492676, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7129780054092407, + "num_tokens": 279006438.0, + "step": 10778 + }, + { + 
"epoch": 1.1837250164726554, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.944066047668457, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7060241103172302, + "num_tokens": 279025046.0, + "step": 10779 + }, + { + "epoch": 1.1838348341752691, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2320590019226074, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7110226154327393, + "num_tokens": 279054000.0, + "step": 10780 + }, + { + "epoch": 1.1839446518778827, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.1325807571411133, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6861571669578552, + "num_tokens": 279086191.0, + "step": 10781 + }, + { + "epoch": 1.1840544695804964, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.6672613620758057, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7180495858192444, + "num_tokens": 279105947.0, + "step": 10782 + }, + { + "epoch": 1.18416428728311, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.390397071838379, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7281374931335449, + "num_tokens": 279132323.0, + "step": 10783 + }, + { + "epoch": 1.1842741049857237, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.512592077255249, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6993233561515808, + "num_tokens": 279157452.0, + "step": 10784 + }, + { + "epoch": 1.1843839226883373, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3019025325775146, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7084834575653076, + "num_tokens": 279185069.0, + "step": 10785 + }, + { + "epoch": 1.184493740390951, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3833680152893066, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7199664115905762, + "num_tokens": 279212124.0, + "step": 10786 + }, + { + "epoch": 1.1846035580935648, + 
"ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3975815773010254, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7139286994934082, + "num_tokens": 279239591.0, + "step": 10787 + }, + { + "epoch": 1.1847133757961783, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.1762073040008545, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7198067903518677, + "num_tokens": 279269866.0, + "step": 10788 + }, + { + "epoch": 1.184823193498792, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5041303634643555, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7038661241531372, + "num_tokens": 279293521.0, + "step": 10789 + }, + { + "epoch": 1.1849330112014056, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.1192612648010254, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7221908569335938, + "num_tokens": 279326470.0, + "step": 10790 + }, + { + "epoch": 1.1850428289040194, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.4655776023864746, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7419148683547974, + "num_tokens": 279351307.0, + "step": 10791 + }, + { + "epoch": 1.185152646606633, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3100850582122803, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6892021298408508, + "num_tokens": 279379656.0, + "step": 10792 + }, + { + "epoch": 1.1852624643092466, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.548370599746704, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7222353219985962, + "num_tokens": 279401365.0, + "step": 10793 + }, + { + "epoch": 1.1853722820118604, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.141998529434204, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6940069198608398, + "num_tokens": 279431184.0, + "step": 10794 + }, + { + "epoch": 1.185482099714474, + "ewc_loss": 1.7404556274414062e-05, 
+ "grad_norm": 2.468092441558838, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.7031388282775879, + "num_tokens": 279455386.0, + "step": 10795 + }, + { + "epoch": 1.1855919174170877, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.395730972290039, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7082016468048096, + "num_tokens": 279480157.0, + "step": 10796 + }, + { + "epoch": 1.1857017351197012, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.079540967941284, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7154064178466797, + "num_tokens": 279511184.0, + "step": 10797 + }, + { + "epoch": 1.185811552822315, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.453385829925537, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7097345590591431, + "num_tokens": 279536340.0, + "step": 10798 + }, + { + "epoch": 1.1859213705249285, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.149308919906616, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7180182933807373, + "num_tokens": 279568355.0, + "step": 10799 + }, + { + "epoch": 1.1860311882275423, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.4405839443206787, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7093147039413452, + "num_tokens": 279592058.0, + "step": 10800 + }, + { + "epoch": 1.186141005930156, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.464942216873169, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6978507041931152, + "num_tokens": 279616851.0, + "step": 10801 + }, + { + "epoch": 1.1862508236327696, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3832809925079346, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7107504606246948, + "num_tokens": 279643692.0, + "step": 10802 + }, + { + "epoch": 1.1863606413353833, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.376328229904175, + 
"learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7132576704025269, + "num_tokens": 279668872.0, + "step": 10803 + }, + { + "epoch": 1.1864704590379969, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.4862678050994873, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7126486301422119, + "num_tokens": 279695404.0, + "step": 10804 + }, + { + "epoch": 1.1865802767406106, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.492630958557129, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7189191579818726, + "num_tokens": 279718073.0, + "step": 10805 + }, + { + "epoch": 1.1866900944432242, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3571975231170654, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7031883001327515, + "num_tokens": 279747779.0, + "step": 10806 + }, + { + "epoch": 1.186799912145838, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.100672483444214, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6977748870849609, + "num_tokens": 279781480.0, + "step": 10807 + }, + { + "epoch": 1.1869097298484517, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2147464752197266, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7094284296035767, + "num_tokens": 279809082.0, + "step": 10808 + }, + { + "epoch": 1.1870195475510652, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.523902416229248, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7204313278198242, + "num_tokens": 279832111.0, + "step": 10809 + }, + { + "epoch": 1.187129365253679, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3054330348968506, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7062228918075562, + "num_tokens": 279860701.0, + "step": 10810 + }, + { + "epoch": 1.1872391829562925, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2096803188323975, + "learning_rate": 1e-06, + "loss": 
1.0065, + "mean_token_accuracy": 0.6994739770889282, + "num_tokens": 279891858.0, + "step": 10811 + }, + { + "epoch": 1.1873490006589063, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.7138829231262207, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.718090295791626, + "num_tokens": 279911871.0, + "step": 10812 + }, + { + "epoch": 1.1874588183615198, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.283174514770508, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.7079024314880371, + "num_tokens": 279937887.0, + "step": 10813 + }, + { + "epoch": 1.1875686360641335, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.3322982788085938, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7038325071334839, + "num_tokens": 279965585.0, + "step": 10814 + }, + { + "epoch": 1.1876784537667473, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.8349108695983887, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7036266326904297, + "num_tokens": 279985822.0, + "step": 10815 + }, + { + "epoch": 1.1877882714693608, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2044410705566406, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7131300568580627, + "num_tokens": 280016291.0, + "step": 10816 + }, + { + "epoch": 1.1878980891719746, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.375873565673828, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6989347338676453, + "num_tokens": 280045789.0, + "step": 10817 + }, + { + "epoch": 1.1880079068745881, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.5606164932250977, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7315542101860046, + "num_tokens": 280068256.0, + "step": 10818 + }, + { + "epoch": 1.1881177245772019, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.57212495803833, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 
0.743897020816803, + "num_tokens": 280092650.0, + "step": 10819 + }, + { + "epoch": 1.1882275422798154, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6491549015045166, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7078667879104614, + "num_tokens": 280118424.0, + "step": 10820 + }, + { + "epoch": 1.1883373599824292, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5067691802978516, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7091817259788513, + "num_tokens": 280142730.0, + "step": 10821 + }, + { + "epoch": 1.1884471776850427, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.671198606491089, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7003767490386963, + "num_tokens": 280163927.0, + "step": 10822 + }, + { + "epoch": 1.1885569953876565, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.626115560531616, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7135040163993835, + "num_tokens": 280184953.0, + "step": 10823 + }, + { + "epoch": 1.1886668130902702, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.66739559173584, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.702307939529419, + "num_tokens": 280206670.0, + "step": 10824 + }, + { + "epoch": 1.1887766307928838, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2951300144195557, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7164541482925415, + "num_tokens": 280232882.0, + "step": 10825 + }, + { + "epoch": 1.1888864484954975, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.50132155418396, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.721663236618042, + "num_tokens": 280257097.0, + "step": 10826 + }, + { + "epoch": 1.188996266198111, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3423101902008057, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7217757701873779, + 
"num_tokens": 280283838.0, + "step": 10827 + }, + { + "epoch": 1.1891060839007248, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6838581562042236, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7213848233222961, + "num_tokens": 280305715.0, + "step": 10828 + }, + { + "epoch": 1.1892159016033386, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.7703418731689453, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7264925241470337, + "num_tokens": 280323948.0, + "step": 10829 + }, + { + "epoch": 1.189325719305952, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.36798357963562, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7082014083862305, + "num_tokens": 280352256.0, + "step": 10830 + }, + { + "epoch": 1.1894355370085659, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4750823974609375, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6937351226806641, + "num_tokens": 280381460.0, + "step": 10831 + }, + { + "epoch": 1.1895453547111794, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6504104137420654, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7228671908378601, + "num_tokens": 280402295.0, + "step": 10832 + }, + { + "epoch": 1.1896551724137931, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4647376537323, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7447119355201721, + "num_tokens": 280424774.0, + "step": 10833 + }, + { + "epoch": 1.1897649901164067, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.7308874130249023, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6933582425117493, + "num_tokens": 280444824.0, + "step": 10834 + }, + { + "epoch": 1.1898748078190204, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6573569774627686, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7048169374465942, + "num_tokens": 280469134.0, + 
"step": 10835 + }, + { + "epoch": 1.189984625521634, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2296926975250244, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7052057981491089, + "num_tokens": 280499041.0, + "step": 10836 + }, + { + "epoch": 1.1900944432242477, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2923924922943115, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.7017605900764465, + "num_tokens": 280528314.0, + "step": 10837 + }, + { + "epoch": 1.1902042609268615, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.425427198410034, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.7045003175735474, + "num_tokens": 280554192.0, + "step": 10838 + }, + { + "epoch": 1.190314078629475, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.424121141433716, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7154514193534851, + "num_tokens": 280579201.0, + "step": 10839 + }, + { + "epoch": 1.1904238963320888, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.081038475036621, + "learning_rate": 1e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6884267330169678, + "num_tokens": 280613242.0, + "step": 10840 + }, + { + "epoch": 1.1905337140347023, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.273601770401001, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7156744003295898, + "num_tokens": 280641227.0, + "step": 10841 + }, + { + "epoch": 1.190643531737316, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.142740249633789, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7061290740966797, + "num_tokens": 280671471.0, + "step": 10842 + }, + { + "epoch": 1.1907533494399298, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4563047885894775, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6981050372123718, + "num_tokens": 280695950.0, + "step": 10843 + }, + { + "epoch": 
1.1908631671425434, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.78121280670166, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7360812425613403, + "num_tokens": 280714361.0, + "step": 10844 + }, + { + "epoch": 1.1909729848451571, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.8677945137023926, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7387917041778564, + "num_tokens": 280734389.0, + "step": 10845 + }, + { + "epoch": 1.1910828025477707, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3367574214935303, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7247394323348999, + "num_tokens": 280759798.0, + "step": 10846 + }, + { + "epoch": 1.1911926202503844, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6103427410125732, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7213441729545593, + "num_tokens": 280782156.0, + "step": 10847 + }, + { + "epoch": 1.191302437952998, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.527888774871826, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7046060562133789, + "num_tokens": 280807703.0, + "step": 10848 + }, + { + "epoch": 1.1914122556556117, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.477022886276245, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7069690823554993, + "num_tokens": 280831959.0, + "step": 10849 + }, + { + "epoch": 1.1915220733582252, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.1040844917297363, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7095518708229065, + "num_tokens": 280864614.0, + "step": 10850 + }, + { + "epoch": 1.191631891060839, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3563201427459717, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6970169544219971, + "num_tokens": 280891174.0, + "step": 10851 + }, + { + "epoch": 1.1917417087634528, + 
"ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3733012676239014, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7093338370323181, + "num_tokens": 280919838.0, + "step": 10852 + }, + { + "epoch": 1.1918515264660663, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.8366050720214844, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7185803651809692, + "num_tokens": 280938891.0, + "step": 10853 + }, + { + "epoch": 1.19196134416868, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4140384197235107, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7101517915725708, + "num_tokens": 280962915.0, + "step": 10854 + }, + { + "epoch": 1.1920711618712936, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2227706909179688, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6977898478507996, + "num_tokens": 280991722.0, + "step": 10855 + }, + { + "epoch": 1.1921809795739073, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.158115863800049, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6982998847961426, + "num_tokens": 281021599.0, + "step": 10856 + }, + { + "epoch": 1.192290797276521, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.1422367095947266, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7180697917938232, + "num_tokens": 281051685.0, + "step": 10857 + }, + { + "epoch": 1.1924006149791346, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.536097764968872, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7068377137184143, + "num_tokens": 281074117.0, + "step": 10858 + }, + { + "epoch": 1.1925104326817484, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.355675458908081, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.700620174407959, + "num_tokens": 281102277.0, + "step": 10859 + }, + { + "epoch": 1.192620250384362, + "ewc_loss": 
1.7404556274414062e-05, + "grad_norm": 2.409665584564209, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.7043824791908264, + "num_tokens": 281128923.0, + "step": 10860 + }, + { + "epoch": 1.1927300680869757, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3246009349823, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7388429045677185, + "num_tokens": 281156396.0, + "step": 10861 + }, + { + "epoch": 1.1928398857895892, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.43886661529541, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7185007333755493, + "num_tokens": 281181795.0, + "step": 10862 + }, + { + "epoch": 1.192949703492203, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.567277193069458, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7038785815238953, + "num_tokens": 281205956.0, + "step": 10863 + }, + { + "epoch": 1.1930595211948165, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3739469051361084, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.703210175037384, + "num_tokens": 281231044.0, + "step": 10864 + }, + { + "epoch": 1.1931693388974303, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.382922649383545, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7277608513832092, + "num_tokens": 281256379.0, + "step": 10865 + }, + { + "epoch": 1.193279156600044, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2043447494506836, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7082335948944092, + "num_tokens": 281286506.0, + "step": 10866 + }, + { + "epoch": 1.1933889743026576, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.323184013366699, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.7050690650939941, + "num_tokens": 281315045.0, + "step": 10867 + }, + { + "epoch": 1.1934987920052713, + "ewc_loss": 1.7404556274414062e-05, + 
"grad_norm": 2.501028299331665, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7166478037834167, + "num_tokens": 281338268.0, + "step": 10868 + }, + { + "epoch": 1.1936086097078848, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.666883707046509, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7271462678909302, + "num_tokens": 281358237.0, + "step": 10869 + }, + { + "epoch": 1.1937184274104986, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5622808933258057, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7303643226623535, + "num_tokens": 281379501.0, + "step": 10870 + }, + { + "epoch": 1.1938282451131121, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.745720624923706, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7174807190895081, + "num_tokens": 281399907.0, + "step": 10871 + }, + { + "epoch": 1.193938062815726, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.373734474182129, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.710580050945282, + "num_tokens": 281426171.0, + "step": 10872 + }, + { + "epoch": 1.1940478805183394, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.423855781555176, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.705010712146759, + "num_tokens": 281452592.0, + "step": 10873 + }, + { + "epoch": 1.1941576982209532, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2817556858062744, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6990653276443481, + "num_tokens": 281479924.0, + "step": 10874 + }, + { + "epoch": 1.194267515923567, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4798786640167236, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7095787525177002, + "num_tokens": 281503398.0, + "step": 10875 + }, + { + "epoch": 1.1943773336261805, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.555537462234497, + 
"learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7126141786575317, + "num_tokens": 281525676.0, + "step": 10876 + }, + { + "epoch": 1.1944871513287942, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5945775508880615, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7162482738494873, + "num_tokens": 281551631.0, + "step": 10877 + }, + { + "epoch": 1.1945969690314078, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.687899112701416, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7111575603485107, + "num_tokens": 281572796.0, + "step": 10878 + }, + { + "epoch": 1.1947067867340215, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5572783946990967, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7059766054153442, + "num_tokens": 281597170.0, + "step": 10879 + }, + { + "epoch": 1.1948166044366353, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.518108367919922, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7181714177131653, + "num_tokens": 281620614.0, + "step": 10880 + }, + { + "epoch": 1.1949264221392488, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3617875576019287, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7108378410339355, + "num_tokens": 281646288.0, + "step": 10881 + }, + { + "epoch": 1.1950362398418626, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5709104537963867, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.722827136516571, + "num_tokens": 281667782.0, + "step": 10882 + }, + { + "epoch": 1.1951460575444761, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4368937015533447, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.716049313545227, + "num_tokens": 281694308.0, + "step": 10883 + }, + { + "epoch": 1.1952558752470899, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.582698106765747, + "learning_rate": 1e-06, + 
"loss": 0.9253, + "mean_token_accuracy": 0.7237414717674255, + "num_tokens": 281716188.0, + "step": 10884 + }, + { + "epoch": 1.1953656929497034, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.330495834350586, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6881675720214844, + "num_tokens": 281743927.0, + "step": 10885 + }, + { + "epoch": 1.1954755106523172, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.1980600357055664, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.710061252117157, + "num_tokens": 281770630.0, + "step": 10886 + }, + { + "epoch": 1.1955853283549307, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5862958431243896, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6937229633331299, + "num_tokens": 281793140.0, + "step": 10887 + }, + { + "epoch": 1.1956951460575445, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6144890785217285, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7221387624740601, + "num_tokens": 281814623.0, + "step": 10888 + }, + { + "epoch": 1.1958049637601582, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4911911487579346, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7087594866752625, + "num_tokens": 281838998.0, + "step": 10889 + }, + { + "epoch": 1.1959147814627717, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2905876636505127, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7236498594284058, + "num_tokens": 281867363.0, + "step": 10890 + }, + { + "epoch": 1.1960245991653855, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5815255641937256, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7177706956863403, + "num_tokens": 281889231.0, + "step": 10891 + }, + { + "epoch": 1.196134416867999, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3322019577026367, + "learning_rate": 1e-06, + "loss": 1.0189, + 
"mean_token_accuracy": 0.7015857696533203, + "num_tokens": 281917109.0, + "step": 10892 + }, + { + "epoch": 1.1962442345706128, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4453296661376953, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6997478008270264, + "num_tokens": 281942238.0, + "step": 10893 + }, + { + "epoch": 1.1963540522732266, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.7262661457061768, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7159301042556763, + "num_tokens": 281963200.0, + "step": 10894 + }, + { + "epoch": 1.19646386997584, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.30975604057312, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6989707946777344, + "num_tokens": 281991008.0, + "step": 10895 + }, + { + "epoch": 1.1965736876784538, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.558150053024292, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7251238822937012, + "num_tokens": 282013040.0, + "step": 10896 + }, + { + "epoch": 1.1966835053810674, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6898608207702637, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7191562652587891, + "num_tokens": 282033724.0, + "step": 10897 + }, + { + "epoch": 1.1967933230836811, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.239877700805664, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6951935291290283, + "num_tokens": 282064888.0, + "step": 10898 + }, + { + "epoch": 1.1969031407862947, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3574702739715576, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.712963342666626, + "num_tokens": 282091842.0, + "step": 10899 + }, + { + "epoch": 1.1970129584889084, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.1719706058502197, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 
0.7215143442153931, + "num_tokens": 282121276.0, + "step": 10900 + }, + { + "epoch": 1.197122776191522, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.9719042778015137, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7212917804718018, + "num_tokens": 282138436.0, + "step": 10901 + }, + { + "epoch": 1.1972325938941357, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.50580096244812, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7199342846870422, + "num_tokens": 282161974.0, + "step": 10902 + }, + { + "epoch": 1.1973424115967495, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4413373470306396, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6874780654907227, + "num_tokens": 282187070.0, + "step": 10903 + }, + { + "epoch": 1.197452229299363, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.7772727012634277, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7058598399162292, + "num_tokens": 282210391.0, + "step": 10904 + }, + { + "epoch": 1.1975620470019768, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4240596294403076, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7225186824798584, + "num_tokens": 282236268.0, + "step": 10905 + }, + { + "epoch": 1.1976718647045903, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.1773693561553955, + "learning_rate": 1e-06, + "loss": 1.0943, + "mean_token_accuracy": 0.684664249420166, + "num_tokens": 282266725.0, + "step": 10906 + }, + { + "epoch": 1.197781682407204, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6037356853485107, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7428321242332458, + "num_tokens": 282286293.0, + "step": 10907 + }, + { + "epoch": 1.1978915001098178, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.216369390487671, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6979498863220215, + 
"num_tokens": 282317493.0, + "step": 10908 + }, + { + "epoch": 1.1980013178124314, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4849259853363037, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6857670545578003, + "num_tokens": 282342822.0, + "step": 10909 + }, + { + "epoch": 1.1981111355150451, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.459277629852295, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7343349456787109, + "num_tokens": 282367191.0, + "step": 10910 + }, + { + "epoch": 1.1982209532176586, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2903709411621094, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.690385639667511, + "num_tokens": 282397156.0, + "step": 10911 + }, + { + "epoch": 1.1983307709202724, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3882603645324707, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7231229543685913, + "num_tokens": 282422169.0, + "step": 10912 + }, + { + "epoch": 1.198440588622886, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2855849266052246, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.713468074798584, + "num_tokens": 282451878.0, + "step": 10913 + }, + { + "epoch": 1.1985504063254997, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.375157594680786, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6965593099594116, + "num_tokens": 282479141.0, + "step": 10914 + }, + { + "epoch": 1.1986602240281132, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.2109737396240234, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6982251405715942, + "num_tokens": 282508341.0, + "step": 10915 + }, + { + "epoch": 1.198770041730727, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3087871074676514, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.721275806427002, + "num_tokens": 282533930.0, + 
"step": 10916 + }, + { + "epoch": 1.1988798594333407, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5032055377960205, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7129194140434265, + "num_tokens": 282556891.0, + "step": 10917 + }, + { + "epoch": 1.1989896771359543, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.654966354370117, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7181729078292847, + "num_tokens": 282578796.0, + "step": 10918 + }, + { + "epoch": 1.199099494838568, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6314570903778076, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.7027499079704285, + "num_tokens": 282603968.0, + "step": 10919 + }, + { + "epoch": 1.1992093125411816, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.7872655391693115, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7163454294204712, + "num_tokens": 282623436.0, + "step": 10920 + }, + { + "epoch": 1.1993191302437953, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.543062448501587, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.715848445892334, + "num_tokens": 282644872.0, + "step": 10921 + }, + { + "epoch": 1.199428947946409, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3166494369506836, + "learning_rate": 1e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.6794897317886353, + "num_tokens": 282675436.0, + "step": 10922 + }, + { + "epoch": 1.1995387656490226, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5340383052825928, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7323973774909973, + "num_tokens": 282698546.0, + "step": 10923 + }, + { + "epoch": 1.1996485833516364, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4138405323028564, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7058553695678711, + "num_tokens": 282722349.0, + "step": 10924 + }, + { + 
"epoch": 1.19975840105425, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.533278465270996, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6944894790649414, + "num_tokens": 282745554.0, + "step": 10925 + }, + { + "epoch": 1.1998682187568637, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.559034824371338, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7156375646591187, + "num_tokens": 282770136.0, + "step": 10926 + }, + { + "epoch": 1.1999780364594772, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.534393548965454, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7057740688323975, + "num_tokens": 282793880.0, + "step": 10927 + }, + { + "epoch": 1.200087854162091, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.491232395172119, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7114397287368774, + "num_tokens": 282819050.0, + "step": 10928 + }, + { + "epoch": 1.2001976718647045, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6986730098724365, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7215216159820557, + "num_tokens": 282838645.0, + "step": 10929 + }, + { + "epoch": 1.2003074895673183, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.1256215572357178, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7050980925559998, + "num_tokens": 282870167.0, + "step": 10930 + }, + { + "epoch": 1.200417307269932, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.182868242263794, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.709877610206604, + "num_tokens": 282900252.0, + "step": 10931 + }, + { + "epoch": 1.2005271249725455, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3730783462524414, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7045966386795044, + "num_tokens": 282924793.0, + "step": 10932 + }, + { + "epoch": 1.2006369426751593, + 
"ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3393783569335938, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.703734278678894, + "num_tokens": 282951368.0, + "step": 10933 + }, + { + "epoch": 1.2007467603777728, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.156905174255371, + "learning_rate": 1e-06, + "loss": 1.0884, + "mean_token_accuracy": 0.6818828582763672, + "num_tokens": 282982310.0, + "step": 10934 + }, + { + "epoch": 1.2008565780803866, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.593536138534546, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.7022439241409302, + "num_tokens": 283005262.0, + "step": 10935 + }, + { + "epoch": 1.2009663957830001, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4805023670196533, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7261672019958496, + "num_tokens": 283029265.0, + "step": 10936 + }, + { + "epoch": 1.2010762134856139, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4175233840942383, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7279085516929626, + "num_tokens": 283055446.0, + "step": 10937 + }, + { + "epoch": 1.2011860311882274, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4986627101898193, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7415717840194702, + "num_tokens": 283080794.0, + "step": 10938 + }, + { + "epoch": 1.2012958488908412, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5128591060638428, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7098811864852905, + "num_tokens": 283103477.0, + "step": 10939 + }, + { + "epoch": 1.201405666593455, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.456552743911743, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6944064497947693, + "num_tokens": 283128921.0, + "step": 10940 + }, + { + "epoch": 1.2015154842960685, + "ewc_loss": 
1.7404556274414062e-05, + "grad_norm": 2.140934705734253, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7061328887939453, + "num_tokens": 283159069.0, + "step": 10941 + }, + { + "epoch": 1.2016253019986822, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.7246947288513184, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7098879814147949, + "num_tokens": 283182509.0, + "step": 10942 + }, + { + "epoch": 1.2017351197012958, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4563965797424316, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6841181516647339, + "num_tokens": 283211447.0, + "step": 10943 + }, + { + "epoch": 1.2018449374039095, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.287421941757202, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.713983952999115, + "num_tokens": 283239895.0, + "step": 10944 + }, + { + "epoch": 1.2019547551065233, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.87646222114563, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7044235467910767, + "num_tokens": 283260397.0, + "step": 10945 + }, + { + "epoch": 1.2020645728091368, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.401101589202881, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7263122797012329, + "num_tokens": 283285092.0, + "step": 10946 + }, + { + "epoch": 1.2021743905117506, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5284132957458496, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7220523357391357, + "num_tokens": 283308409.0, + "step": 10947 + }, + { + "epoch": 1.202284208214364, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3557631969451904, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6975972652435303, + "num_tokens": 283337028.0, + "step": 10948 + }, + { + "epoch": 1.2023940259169779, + "ewc_loss": 1.7404556274414062e-05, + 
"grad_norm": 2.3932371139526367, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7153267860412598, + "num_tokens": 283362535.0, + "step": 10949 + }, + { + "epoch": 1.2025038436195914, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3088552951812744, + "learning_rate": 1e-06, + "loss": 1.0932, + "mean_token_accuracy": 0.6765707731246948, + "num_tokens": 283392480.0, + "step": 10950 + }, + { + "epoch": 1.2026136613222052, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5383269786834717, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7158038020133972, + "num_tokens": 283414290.0, + "step": 10951 + }, + { + "epoch": 1.2027234790248187, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.712395191192627, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7325783967971802, + "num_tokens": 283434515.0, + "step": 10952 + }, + { + "epoch": 1.2028332967274324, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.460425853729248, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7155998349189758, + "num_tokens": 283459691.0, + "step": 10953 + }, + { + "epoch": 1.2029431144300462, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4712600708007812, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7165234088897705, + "num_tokens": 283483621.0, + "step": 10954 + }, + { + "epoch": 1.2030529321326597, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.471299648284912, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7250542044639587, + "num_tokens": 283507753.0, + "step": 10955 + }, + { + "epoch": 1.2031627498352735, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4626293182373047, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7073354721069336, + "num_tokens": 283532007.0, + "step": 10956 + }, + { + "epoch": 1.203272567537887, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 
2.566324472427368, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6927191019058228, + "num_tokens": 283555781.0, + "step": 10957 + }, + { + "epoch": 1.2033823852405008, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.569140911102295, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.726574182510376, + "num_tokens": 283578730.0, + "step": 10958 + }, + { + "epoch": 1.2034922029431145, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4026360511779785, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7396463751792908, + "num_tokens": 283602621.0, + "step": 10959 + }, + { + "epoch": 1.203602020645728, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.0246529579162598, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7099798917770386, + "num_tokens": 283638322.0, + "step": 10960 + }, + { + "epoch": 1.2037118383483418, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.221592664718628, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7059030532836914, + "num_tokens": 283668495.0, + "step": 10961 + }, + { + "epoch": 1.2038216560509554, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.7136263847351074, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7208629846572876, + "num_tokens": 283688969.0, + "step": 10962 + }, + { + "epoch": 1.2039314737535691, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3842878341674805, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6926632523536682, + "num_tokens": 283717810.0, + "step": 10963 + }, + { + "epoch": 1.2040412914561827, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6288399696350098, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6954694986343384, + "num_tokens": 283740254.0, + "step": 10964 + }, + { + "epoch": 1.2041511091587964, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.6979904174804688, + 
"learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7178425788879395, + "num_tokens": 283760259.0, + "step": 10965 + }, + { + "epoch": 1.20426092686141, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4764370918273926, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7075793743133545, + "num_tokens": 283784178.0, + "step": 10966 + }, + { + "epoch": 1.2043707445640237, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.80841064453125, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7339751124382019, + "num_tokens": 283806168.0, + "step": 10967 + }, + { + "epoch": 1.2044805622666375, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.4705123901367188, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7202216386795044, + "num_tokens": 283830981.0, + "step": 10968 + }, + { + "epoch": 1.204590379969251, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.3788392543792725, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7050951719284058, + "num_tokens": 283856809.0, + "step": 10969 + }, + { + "epoch": 1.2047001976718648, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.5230772495269775, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7148934006690979, + "num_tokens": 283880097.0, + "step": 10970 + }, + { + "epoch": 1.2048100153744783, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.529806613922119, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7242746949195862, + "num_tokens": 283904831.0, + "step": 10971 + }, + { + "epoch": 1.204919833077092, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 2.491798162460327, + "learning_rate": 1e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.7017999887466431, + "num_tokens": 283931547.0, + "step": 10972 + }, + { + "epoch": 1.2050296507797058, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.7283387184143066, + "learning_rate": 1e-06, + 
"loss": 0.9757, + "mean_token_accuracy": 0.7167242765426636, + "num_tokens": 283950698.0, + "step": 10973 + }, + { + "epoch": 1.2051394684823193, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6223795413970947, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7187550663948059, + "num_tokens": 283974470.0, + "step": 10974 + }, + { + "epoch": 1.205249286184933, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4331188201904297, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6975289583206177, + "num_tokens": 284001087.0, + "step": 10975 + }, + { + "epoch": 1.2053591038875466, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.8419923782348633, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7312173843383789, + "num_tokens": 284020322.0, + "step": 10976 + }, + { + "epoch": 1.2054689215901604, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.7853190898895264, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6904280185699463, + "num_tokens": 284040858.0, + "step": 10977 + }, + { + "epoch": 1.205578739292774, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.359813928604126, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6951537132263184, + "num_tokens": 284067636.0, + "step": 10978 + }, + { + "epoch": 1.2056885569953877, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.313396453857422, + "learning_rate": 1e-06, + "loss": 1.0982, + "mean_token_accuracy": 0.6939266920089722, + "num_tokens": 284096336.0, + "step": 10979 + }, + { + "epoch": 1.2057983746980012, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.549048900604248, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7026766538619995, + "num_tokens": 284120413.0, + "step": 10980 + }, + { + "epoch": 1.205908192400615, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3582844734191895, + "learning_rate": 1e-06, + "loss": 0.9977, + 
"mean_token_accuracy": 0.7168699502944946, + "num_tokens": 284145303.0, + "step": 10981 + }, + { + "epoch": 1.2060180101032287, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3876709938049316, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7542628645896912, + "num_tokens": 284167398.0, + "step": 10982 + }, + { + "epoch": 1.2061278278058423, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4202728271484375, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7139874696731567, + "num_tokens": 284191959.0, + "step": 10983 + }, + { + "epoch": 1.206237645508456, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.386108636856079, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7090653777122498, + "num_tokens": 284217135.0, + "step": 10984 + }, + { + "epoch": 1.2063474632110696, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1233582496643066, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.687002420425415, + "num_tokens": 284252636.0, + "step": 10985 + }, + { + "epoch": 1.2064572809136833, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3720898628234863, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7224048376083374, + "num_tokens": 284277462.0, + "step": 10986 + }, + { + "epoch": 1.2065670986162969, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.569631814956665, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7393494844436646, + "num_tokens": 284298181.0, + "step": 10987 + }, + { + "epoch": 1.2066769163189106, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2221293449401855, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7049550414085388, + "num_tokens": 284326972.0, + "step": 10988 + }, + { + "epoch": 1.2067867340215244, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4844205379486084, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 
0.7183173298835754, + "num_tokens": 284352298.0, + "step": 10989 + }, + { + "epoch": 1.206896551724138, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.8010668754577637, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7135804295539856, + "num_tokens": 284371893.0, + "step": 10990 + }, + { + "epoch": 1.2070063694267517, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4261162281036377, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6988564133644104, + "num_tokens": 284398281.0, + "step": 10991 + }, + { + "epoch": 1.2071161871293652, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5317909717559814, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.708572268486023, + "num_tokens": 284421089.0, + "step": 10992 + }, + { + "epoch": 1.207226004831979, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.729273796081543, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7243716716766357, + "num_tokens": 284441192.0, + "step": 10993 + }, + { + "epoch": 1.2073358225345925, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.266090154647827, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6897398233413696, + "num_tokens": 284471296.0, + "step": 10994 + }, + { + "epoch": 1.2074456402372062, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3382506370544434, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7016239762306213, + "num_tokens": 284497238.0, + "step": 10995 + }, + { + "epoch": 1.20755545793982, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2314257621765137, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6932120323181152, + "num_tokens": 284525660.0, + "step": 10996 + }, + { + "epoch": 1.2076652756424335, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2927005290985107, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7158998847007751, + 
"num_tokens": 284554370.0, + "step": 10997 + }, + { + "epoch": 1.2077750933450473, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 3.9633047580718994, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7189849019050598, + "num_tokens": 284577055.0, + "step": 10998 + }, + { + "epoch": 1.2078849110476608, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.50355863571167, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7245818972587585, + "num_tokens": 284601534.0, + "step": 10999 + }, + { + "epoch": 1.2079947287502746, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6260006427764893, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7099681496620178, + "num_tokens": 284624533.0, + "step": 11000 + }, + { + "epoch": 1.2081045464528881, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.170480728149414, + "learning_rate": 1e-06, + "loss": 1.1071, + "mean_token_accuracy": 0.6715143918991089, + "num_tokens": 284655548.0, + "step": 11001 + }, + { + "epoch": 1.2082143641555019, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.39449143409729, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7074770331382751, + "num_tokens": 284678884.0, + "step": 11002 + }, + { + "epoch": 1.2083241818581154, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6183419227600098, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7069870829582214, + "num_tokens": 284703846.0, + "step": 11003 + }, + { + "epoch": 1.2084339995607292, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.431723117828369, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7149643898010254, + "num_tokens": 284729345.0, + "step": 11004 + }, + { + "epoch": 1.208543817263343, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1807217597961426, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.713647723197937, + "num_tokens": 284759880.0, + 
"step": 11005 + }, + { + "epoch": 1.2086536349659565, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.9250829219818115, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7282652258872986, + "num_tokens": 284778553.0, + "step": 11006 + }, + { + "epoch": 1.2087634526685702, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.314131259918213, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7272412180900574, + "num_tokens": 284803349.0, + "step": 11007 + }, + { + "epoch": 1.2088732703711838, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2302258014678955, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6844249963760376, + "num_tokens": 284836162.0, + "step": 11008 + }, + { + "epoch": 1.2089830880737975, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6842541694641113, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7200814485549927, + "num_tokens": 284855788.0, + "step": 11009 + }, + { + "epoch": 1.2090929057764113, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.647350788116455, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.711125373840332, + "num_tokens": 284880287.0, + "step": 11010 + }, + { + "epoch": 1.2092027234790248, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.449800968170166, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7193650007247925, + "num_tokens": 284904543.0, + "step": 11011 + }, + { + "epoch": 1.2093125411816386, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2799766063690186, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7032333612442017, + "num_tokens": 284933876.0, + "step": 11012 + }, + { + "epoch": 1.209422358884252, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.446608304977417, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7008825540542603, + "num_tokens": 284961257.0, + "step": 11013 + }, + { + 
"epoch": 1.2095321765868658, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4139351844787598, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6879010200500488, + "num_tokens": 284988656.0, + "step": 11014 + }, + { + "epoch": 1.2096419942894794, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.7727954387664795, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7238491773605347, + "num_tokens": 285009556.0, + "step": 11015 + }, + { + "epoch": 1.2097518119920931, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.469109535217285, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7323288917541504, + "num_tokens": 285034371.0, + "step": 11016 + }, + { + "epoch": 1.2098616296947067, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4833409786224365, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7182148694992065, + "num_tokens": 285056245.0, + "step": 11017 + }, + { + "epoch": 1.2099714473973204, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3919107913970947, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7114155292510986, + "num_tokens": 285081384.0, + "step": 11018 + }, + { + "epoch": 1.2100812650999342, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.7239084243774414, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7189764976501465, + "num_tokens": 285101546.0, + "step": 11019 + }, + { + "epoch": 1.2101910828025477, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3388333320617676, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7269070148468018, + "num_tokens": 285128909.0, + "step": 11020 + }, + { + "epoch": 1.2103009005051615, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.793159246444702, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7031609416007996, + "num_tokens": 285149217.0, + "step": 11021 + }, + { + "epoch": 1.210410718207775, 
+ "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.784313201904297, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7356520891189575, + "num_tokens": 285173057.0, + "step": 11022 + }, + { + "epoch": 1.2105205359103888, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4980692863464355, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.720573902130127, + "num_tokens": 285198193.0, + "step": 11023 + }, + { + "epoch": 1.2106303536130025, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.405888319015503, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.7027902603149414, + "num_tokens": 285226208.0, + "step": 11024 + }, + { + "epoch": 1.210740171315616, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4667775630950928, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7080662250518799, + "num_tokens": 285249441.0, + "step": 11025 + }, + { + "epoch": 1.2108499890182298, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.355668544769287, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7346641421318054, + "num_tokens": 285274785.0, + "step": 11026 + }, + { + "epoch": 1.2109598067208434, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.0586864948272705, + "learning_rate": 1e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6850863695144653, + "num_tokens": 285311003.0, + "step": 11027 + }, + { + "epoch": 1.2110696244234571, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.492445230484009, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.71649169921875, + "num_tokens": 285334538.0, + "step": 11028 + }, + { + "epoch": 1.2111794421260706, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.400538206100464, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7097886204719543, + "num_tokens": 285361596.0, + "step": 11029 + }, + { + "epoch": 1.2112892598286844, + "ewc_loss": 
1.7523765563964844e-05, + "grad_norm": 2.272272825241089, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6986719965934753, + "num_tokens": 285389838.0, + "step": 11030 + }, + { + "epoch": 1.211399077531298, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2629411220550537, + "learning_rate": 1e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.6932933330535889, + "num_tokens": 285417739.0, + "step": 11031 + }, + { + "epoch": 1.2115088952339117, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.924283742904663, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7239159345626831, + "num_tokens": 285435299.0, + "step": 11032 + }, + { + "epoch": 1.2116187129365255, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4886245727539062, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7078012824058533, + "num_tokens": 285460233.0, + "step": 11033 + }, + { + "epoch": 1.211728530639139, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5895559787750244, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7045660614967346, + "num_tokens": 285485153.0, + "step": 11034 + }, + { + "epoch": 1.2118383483417527, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.412879467010498, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7237927913665771, + "num_tokens": 285510008.0, + "step": 11035 + }, + { + "epoch": 1.2119481660443663, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4105193614959717, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.706609845161438, + "num_tokens": 285535003.0, + "step": 11036 + }, + { + "epoch": 1.21205798374698, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.162598133087158, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7034196257591248, + "num_tokens": 285565024.0, + "step": 11037 + }, + { + "epoch": 1.2121678014495938, + "ewc_loss": 1.7523765563964844e-05, + 
"grad_norm": 2.1012961864471436, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7153725624084473, + "num_tokens": 285598590.0, + "step": 11038 + }, + { + "epoch": 1.2122776191522073, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5830979347229004, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7150867581367493, + "num_tokens": 285619570.0, + "step": 11039 + }, + { + "epoch": 1.212387436854821, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.372476577758789, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6908993721008301, + "num_tokens": 285644388.0, + "step": 11040 + }, + { + "epoch": 1.2124972545574346, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2166101932525635, + "learning_rate": 1e-06, + "loss": 1.0857, + "mean_token_accuracy": 0.6876094341278076, + "num_tokens": 285676890.0, + "step": 11041 + }, + { + "epoch": 1.2126070722600484, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.791350841522217, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7286977767944336, + "num_tokens": 285694848.0, + "step": 11042 + }, + { + "epoch": 1.212716889962662, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.832413911819458, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7208811044692993, + "num_tokens": 285713171.0, + "step": 11043 + }, + { + "epoch": 1.2128267076652757, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.45051908493042, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7431440353393555, + "num_tokens": 285736481.0, + "step": 11044 + }, + { + "epoch": 1.2129365253678892, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.170348644256592, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6852178573608398, + "num_tokens": 285766170.0, + "step": 11045 + }, + { + "epoch": 1.213046343070503, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 3.0276269912719727, 
+ "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7308754324913025, + "num_tokens": 285782962.0, + "step": 11046 + }, + { + "epoch": 1.2131561607731167, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.552290201187134, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.730118989944458, + "num_tokens": 285805223.0, + "step": 11047 + }, + { + "epoch": 1.2132659784757303, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4398560523986816, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7131692171096802, + "num_tokens": 285830478.0, + "step": 11048 + }, + { + "epoch": 1.213375796178344, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.910794496536255, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7484549283981323, + "num_tokens": 285848620.0, + "step": 11049 + }, + { + "epoch": 1.2134856138809575, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3113458156585693, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7024779319763184, + "num_tokens": 285876089.0, + "step": 11050 + }, + { + "epoch": 1.2135954315835713, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.474611520767212, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7329570055007935, + "num_tokens": 285900376.0, + "step": 11051 + }, + { + "epoch": 1.2137052492861848, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4637320041656494, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7270668745040894, + "num_tokens": 285923703.0, + "step": 11052 + }, + { + "epoch": 1.2138150669887986, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.466278553009033, + "learning_rate": 1e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.6919456124305725, + "num_tokens": 285950717.0, + "step": 11053 + }, + { + "epoch": 1.2139248846914121, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.641810655593872, + "learning_rate": 1e-06, + 
"loss": 1.0199, + "mean_token_accuracy": 0.694085955619812, + "num_tokens": 285973567.0, + "step": 11054 + }, + { + "epoch": 1.2140347023940259, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4945106506347656, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.702534019947052, + "num_tokens": 285997017.0, + "step": 11055 + }, + { + "epoch": 1.2141445200966396, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1534292697906494, + "learning_rate": 1e-06, + "loss": 1.1185, + "mean_token_accuracy": 0.6759344339370728, + "num_tokens": 286028655.0, + "step": 11056 + }, + { + "epoch": 1.2142543377992532, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.373756170272827, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.7055659294128418, + "num_tokens": 286054292.0, + "step": 11057 + }, + { + "epoch": 1.214364155501867, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3614501953125, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7202445268630981, + "num_tokens": 286079944.0, + "step": 11058 + }, + { + "epoch": 1.2144739732044805, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.304736614227295, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6916931867599487, + "num_tokens": 286108619.0, + "step": 11059 + }, + { + "epoch": 1.2145837909070942, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1982457637786865, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7223743200302124, + "num_tokens": 286137641.0, + "step": 11060 + }, + { + "epoch": 1.214693608609708, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3600974082946777, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7126969695091248, + "num_tokens": 286163781.0, + "step": 11061 + }, + { + "epoch": 1.2148034263123215, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3050127029418945, + "learning_rate": 1e-06, + "loss": 1.0059, + 
"mean_token_accuracy": 0.7017171382904053, + "num_tokens": 286193706.0, + "step": 11062 + }, + { + "epoch": 1.2149132440149353, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2337968349456787, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6996860504150391, + "num_tokens": 286224522.0, + "step": 11063 + }, + { + "epoch": 1.2150230617175488, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.278470039367676, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7277843952178955, + "num_tokens": 286249405.0, + "step": 11064 + }, + { + "epoch": 1.2151328794201626, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.271636962890625, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7140578627586365, + "num_tokens": 286278531.0, + "step": 11065 + }, + { + "epoch": 1.215242697122776, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1403656005859375, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7059956789016724, + "num_tokens": 286308331.0, + "step": 11066 + }, + { + "epoch": 1.2153525148253899, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2318484783172607, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6994990110397339, + "num_tokens": 286339208.0, + "step": 11067 + }, + { + "epoch": 1.2154623325280034, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4464194774627686, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7267762422561646, + "num_tokens": 286361971.0, + "step": 11068 + }, + { + "epoch": 1.2155721502306172, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1260342597961426, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7110533714294434, + "num_tokens": 286392775.0, + "step": 11069 + }, + { + "epoch": 1.215681967933231, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.794804811477661, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 
0.7206124663352966, + "num_tokens": 286411263.0, + "step": 11070 + }, + { + "epoch": 1.2157917856358444, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.326993703842163, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.7037649750709534, + "num_tokens": 286437434.0, + "step": 11071 + }, + { + "epoch": 1.2159016033384582, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.39595627784729, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7157933115959167, + "num_tokens": 286461643.0, + "step": 11072 + }, + { + "epoch": 1.2160114210410717, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.163740873336792, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.7002161741256714, + "num_tokens": 286493286.0, + "step": 11073 + }, + { + "epoch": 1.2161212387436855, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6215713024139404, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7086915969848633, + "num_tokens": 286514586.0, + "step": 11074 + }, + { + "epoch": 1.2162310564462993, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.785609722137451, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7314404845237732, + "num_tokens": 286533142.0, + "step": 11075 + }, + { + "epoch": 1.2163408741489128, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2983086109161377, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7061387896537781, + "num_tokens": 286560108.0, + "step": 11076 + }, + { + "epoch": 1.2164506918515265, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1402876377105713, + "learning_rate": 1e-06, + "loss": 1.1081, + "mean_token_accuracy": 0.6817337870597839, + "num_tokens": 286592101.0, + "step": 11077 + }, + { + "epoch": 1.21656050955414, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.446115016937256, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.704261064529419, + 
"num_tokens": 286614678.0, + "step": 11078 + }, + { + "epoch": 1.2166703272567538, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.620598793029785, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.704657256603241, + "num_tokens": 286636211.0, + "step": 11079 + }, + { + "epoch": 1.2167801449593674, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4058876037597656, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7121021747589111, + "num_tokens": 286662911.0, + "step": 11080 + }, + { + "epoch": 1.2168899626619811, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2989559173583984, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7114919424057007, + "num_tokens": 286690482.0, + "step": 11081 + }, + { + "epoch": 1.2169997803645947, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3740086555480957, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.693650484085083, + "num_tokens": 286716439.0, + "step": 11082 + }, + { + "epoch": 1.2171095980672084, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.131885051727295, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7200212478637695, + "num_tokens": 286745981.0, + "step": 11083 + }, + { + "epoch": 1.2172194157698222, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.289006233215332, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7081539630889893, + "num_tokens": 286774086.0, + "step": 11084 + }, + { + "epoch": 1.2173292334724357, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.44832444190979, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.713388204574585, + "num_tokens": 286797551.0, + "step": 11085 + }, + { + "epoch": 1.2174390511750495, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5722784996032715, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.7099835872650146, + "num_tokens": 286821481.0, + 
"step": 11086 + }, + { + "epoch": 1.217548868877663, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2940680980682373, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6957706809043884, + "num_tokens": 286851130.0, + "step": 11087 + }, + { + "epoch": 1.2176586865802768, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.798020124435425, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7198944091796875, + "num_tokens": 286870940.0, + "step": 11088 + }, + { + "epoch": 1.2177685042828905, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5161192417144775, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7206360697746277, + "num_tokens": 286893884.0, + "step": 11089 + }, + { + "epoch": 1.217878321985504, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5602219104766846, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7288591265678406, + "num_tokens": 286916003.0, + "step": 11090 + }, + { + "epoch": 1.2179881396881178, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6600265502929688, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7004584670066833, + "num_tokens": 286938420.0, + "step": 11091 + }, + { + "epoch": 1.2180979573907313, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.8998847007751465, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7286345958709717, + "num_tokens": 286958223.0, + "step": 11092 + }, + { + "epoch": 1.218207775093345, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.438249349594116, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7138698101043701, + "num_tokens": 286982199.0, + "step": 11093 + }, + { + "epoch": 1.2183175927959586, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2853829860687256, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7092686295509338, + "num_tokens": 287010430.0, + "step": 11094 + }, + { + 
"epoch": 1.2184274104985724, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6799683570861816, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.71669602394104, + "num_tokens": 287030970.0, + "step": 11095 + }, + { + "epoch": 1.218537228201186, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3079142570495605, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6986912488937378, + "num_tokens": 287057132.0, + "step": 11096 + }, + { + "epoch": 1.2186470459037997, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.445762872695923, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7260915040969849, + "num_tokens": 287082012.0, + "step": 11097 + }, + { + "epoch": 1.2187568636064134, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 7.089148998260498, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6954022645950317, + "num_tokens": 287109465.0, + "step": 11098 + }, + { + "epoch": 1.218866681309027, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.373612880706787, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7394102215766907, + "num_tokens": 287135634.0, + "step": 11099 + }, + { + "epoch": 1.2189764990116407, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.9834752082824707, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7267872095108032, + "num_tokens": 287151167.0, + "step": 11100 + }, + { + "epoch": 1.2190863167142543, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4369595050811768, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7331076860427856, + "num_tokens": 287176289.0, + "step": 11101 + }, + { + "epoch": 1.219196134416868, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.557154655456543, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7033244967460632, + "num_tokens": 287199749.0, + "step": 11102 + }, + { + "epoch": 1.2193059521194818, + 
"ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.247352361679077, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6976374983787537, + "num_tokens": 287229177.0, + "step": 11103 + }, + { + "epoch": 1.2194157698220953, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5146660804748535, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.70192950963974, + "num_tokens": 287257362.0, + "step": 11104 + }, + { + "epoch": 1.219525587524709, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2212412357330322, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7156118154525757, + "num_tokens": 287286351.0, + "step": 11105 + }, + { + "epoch": 1.2196354052273226, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4833431243896484, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7108219861984253, + "num_tokens": 287312114.0, + "step": 11106 + }, + { + "epoch": 1.2197452229299364, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2858002185821533, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7168664336204529, + "num_tokens": 287340424.0, + "step": 11107 + }, + { + "epoch": 1.21985504063255, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5209360122680664, + "learning_rate": 1e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6984385848045349, + "num_tokens": 287365830.0, + "step": 11108 + }, + { + "epoch": 1.2199648583351637, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2828383445739746, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7223970890045166, + "num_tokens": 287392435.0, + "step": 11109 + }, + { + "epoch": 1.2200746760377772, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.819293260574341, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7210659980773926, + "num_tokens": 287412571.0, + "step": 11110 + }, + { + "epoch": 1.220184493740391, + "ewc_loss": 
1.7523765563964844e-05, + "grad_norm": 2.2928740978240967, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.70630943775177, + "num_tokens": 287441777.0, + "step": 11111 + }, + { + "epoch": 1.2202943114430047, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.671617269515991, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7058534622192383, + "num_tokens": 287462481.0, + "step": 11112 + }, + { + "epoch": 1.2204041291456182, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.263375759124756, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.70723956823349, + "num_tokens": 287491128.0, + "step": 11113 + }, + { + "epoch": 1.220513946848232, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.9904534816741943, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7159973382949829, + "num_tokens": 287524819.0, + "step": 11114 + }, + { + "epoch": 1.2206237645508455, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.9770116806030273, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7091457843780518, + "num_tokens": 287562937.0, + "step": 11115 + }, + { + "epoch": 1.2207335822534593, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5619099140167236, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7334587574005127, + "num_tokens": 287584404.0, + "step": 11116 + }, + { + "epoch": 1.2208433999560728, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3390839099884033, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7381773591041565, + "num_tokens": 287609866.0, + "step": 11117 + }, + { + "epoch": 1.2209532176586866, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.18835711479187, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6988477110862732, + "num_tokens": 287639847.0, + "step": 11118 + }, + { + "epoch": 1.2210630353613001, + "ewc_loss": 1.7523765563964844e-05, + 
"grad_norm": 2.1545610427856445, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7108167409896851, + "num_tokens": 287670981.0, + "step": 11119 + }, + { + "epoch": 1.2211728530639139, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2702090740203857, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.70174241065979, + "num_tokens": 287701178.0, + "step": 11120 + }, + { + "epoch": 1.2212826707665276, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3248066902160645, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6951676607131958, + "num_tokens": 287728371.0, + "step": 11121 + }, + { + "epoch": 1.2213924884691412, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4525680541992188, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7134374976158142, + "num_tokens": 287752359.0, + "step": 11122 + }, + { + "epoch": 1.221502306171755, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5813333988189697, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7244952917098999, + "num_tokens": 287774074.0, + "step": 11123 + }, + { + "epoch": 1.2216121238743685, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.202039957046509, + "learning_rate": 1e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.6916576623916626, + "num_tokens": 287805557.0, + "step": 11124 + }, + { + "epoch": 1.2217219415769822, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4229660034179688, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6965557932853699, + "num_tokens": 287831463.0, + "step": 11125 + }, + { + "epoch": 1.221831759279596, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5877108573913574, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.732306718826294, + "num_tokens": 287852361.0, + "step": 11126 + }, + { + "epoch": 1.2219415769822095, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 
2.777212142944336, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7175716161727905, + "num_tokens": 287871387.0, + "step": 11127 + }, + { + "epoch": 1.2220513946848233, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4767839908599854, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7100635766983032, + "num_tokens": 287897123.0, + "step": 11128 + }, + { + "epoch": 1.2221612123874368, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5586583614349365, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7108384370803833, + "num_tokens": 287920282.0, + "step": 11129 + }, + { + "epoch": 1.2222710300900506, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.450657367706299, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7074345350265503, + "num_tokens": 287945621.0, + "step": 11130 + }, + { + "epoch": 1.222380847792664, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2888805866241455, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.72519451379776, + "num_tokens": 287972594.0, + "step": 11131 + }, + { + "epoch": 1.2224906654952779, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1140201091766357, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6963175535202026, + "num_tokens": 288005889.0, + "step": 11132 + }, + { + "epoch": 1.2226004831978914, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5615668296813965, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7217420935630798, + "num_tokens": 288026620.0, + "step": 11133 + }, + { + "epoch": 1.2227103009005051, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3155055046081543, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7075719833374023, + "num_tokens": 288055937.0, + "step": 11134 + }, + { + "epoch": 1.222820118603119, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.357288122177124, + 
"learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7133715748786926, + "num_tokens": 288082276.0, + "step": 11135 + }, + { + "epoch": 1.2229299363057324, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.581601858139038, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7064694762229919, + "num_tokens": 288105292.0, + "step": 11136 + }, + { + "epoch": 1.2230397540083462, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.6029059886932373, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7181349992752075, + "num_tokens": 288131219.0, + "step": 11137 + }, + { + "epoch": 1.2231495717109597, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1699981689453125, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7042370438575745, + "num_tokens": 288164176.0, + "step": 11138 + }, + { + "epoch": 1.2232593894135735, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.546164035797119, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7078671455383301, + "num_tokens": 288188719.0, + "step": 11139 + }, + { + "epoch": 1.2233692071161872, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.7664294242858887, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7008821964263916, + "num_tokens": 288216109.0, + "step": 11140 + }, + { + "epoch": 1.2234790248188008, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5042641162872314, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7052873373031616, + "num_tokens": 288240464.0, + "step": 11141 + }, + { + "epoch": 1.2235888425214145, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3113718032836914, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7185897827148438, + "num_tokens": 288266942.0, + "step": 11142 + }, + { + "epoch": 1.223698660224028, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3385071754455566, + "learning_rate": 1e-06, + 
"loss": 1.0202, + "mean_token_accuracy": 0.6972147226333618, + "num_tokens": 288295463.0, + "step": 11143 + }, + { + "epoch": 1.2238084779266418, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.387641668319702, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7222371697425842, + "num_tokens": 288319423.0, + "step": 11144 + }, + { + "epoch": 1.2239182956292554, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.8892407417297363, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7262806296348572, + "num_tokens": 288338142.0, + "step": 11145 + }, + { + "epoch": 1.2240281133318691, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.2478346824645996, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.725134551525116, + "num_tokens": 288365441.0, + "step": 11146 + }, + { + "epoch": 1.2241379310344827, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5253820419311523, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.691534698009491, + "num_tokens": 288391090.0, + "step": 11147 + }, + { + "epoch": 1.2242477487370964, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.1070127487182617, + "learning_rate": 1e-06, + "loss": 1.0798, + "mean_token_accuracy": 0.6883738040924072, + "num_tokens": 288423660.0, + "step": 11148 + }, + { + "epoch": 1.2243575664397102, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.290102005004883, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7134913802146912, + "num_tokens": 288452342.0, + "step": 11149 + }, + { + "epoch": 1.2244673841423237, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.305354356765747, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.701659083366394, + "num_tokens": 288480932.0, + "step": 11150 + }, + { + "epoch": 1.2245772018449375, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.935875415802002, + "learning_rate": 1e-06, + "loss": 0.8922, + 
"mean_token_accuracy": 0.739776611328125, + "num_tokens": 288498077.0, + "step": 11151 + }, + { + "epoch": 1.224687019547551, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3952105045318604, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6990196704864502, + "num_tokens": 288524421.0, + "step": 11152 + }, + { + "epoch": 1.2247968372501647, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.421713352203369, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7151877284049988, + "num_tokens": 288551768.0, + "step": 11153 + }, + { + "epoch": 1.2249066549527785, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.53009295463562, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7203664183616638, + "num_tokens": 288575893.0, + "step": 11154 + }, + { + "epoch": 1.225016472655392, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5009498596191406, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7187085151672363, + "num_tokens": 288598787.0, + "step": 11155 + }, + { + "epoch": 1.2251262903580058, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5810554027557373, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7085378170013428, + "num_tokens": 288620878.0, + "step": 11156 + }, + { + "epoch": 1.2252361080606193, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.3157453536987305, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7135502696037292, + "num_tokens": 288648869.0, + "step": 11157 + }, + { + "epoch": 1.225345925763233, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.679072856903076, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7394152879714966, + "num_tokens": 288668783.0, + "step": 11158 + }, + { + "epoch": 1.2254557434658466, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.383453607559204, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 
0.7119683027267456, + "num_tokens": 288696825.0, + "step": 11159 + }, + { + "epoch": 1.2255655611684604, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.5101988315582275, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7184320092201233, + "num_tokens": 288729299.0, + "step": 11160 + }, + { + "epoch": 1.225675378871074, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.494633674621582, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7141347527503967, + "num_tokens": 288754789.0, + "step": 11161 + }, + { + "epoch": 1.2257851965736877, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4253692626953125, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6920278072357178, + "num_tokens": 288780435.0, + "step": 11162 + }, + { + "epoch": 1.2258950142763014, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4560937881469727, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7053927183151245, + "num_tokens": 288803995.0, + "step": 11163 + }, + { + "epoch": 1.226004831978915, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2443413734436035, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6988561749458313, + "num_tokens": 288832003.0, + "step": 11164 + }, + { + "epoch": 1.2261146496815287, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.282472610473633, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6993213891983032, + "num_tokens": 288862135.0, + "step": 11165 + }, + { + "epoch": 1.2262244673841423, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.182889699935913, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6956213712692261, + "num_tokens": 288890024.0, + "step": 11166 + }, + { + "epoch": 1.226334285086756, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 2.4362103939056396, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7107540369033813, + 
"num_tokens": 288913336.0, + "step": 11167 + }, + { + "epoch": 1.2264441027893696, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2621965408325195, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7119324207305908, + "num_tokens": 288944772.0, + "step": 11168 + }, + { + "epoch": 1.2265539204919833, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.270684242248535, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7089993357658386, + "num_tokens": 288972703.0, + "step": 11169 + }, + { + "epoch": 1.226663738194597, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.426238536834717, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.732445478439331, + "num_tokens": 288997926.0, + "step": 11170 + }, + { + "epoch": 1.2267735558972106, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5523085594177246, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6970746517181396, + "num_tokens": 289020373.0, + "step": 11171 + }, + { + "epoch": 1.2268833735998244, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.401534080505371, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6874173283576965, + "num_tokens": 289048954.0, + "step": 11172 + }, + { + "epoch": 1.226993191302438, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.6504812240600586, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7112708687782288, + "num_tokens": 289073845.0, + "step": 11173 + }, + { + "epoch": 1.2271030090050516, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.716458320617676, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7182801961898804, + "num_tokens": 289094320.0, + "step": 11174 + }, + { + "epoch": 1.2272128267076652, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.414574146270752, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7342388033866882, + "num_tokens": 289123281.0, + 
"step": 11175 + }, + { + "epoch": 1.227322644410279, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.6706912517547607, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6970529556274414, + "num_tokens": 289146395.0, + "step": 11176 + }, + { + "epoch": 1.2274324621128927, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.666238784790039, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7147478461265564, + "num_tokens": 289166789.0, + "step": 11177 + }, + { + "epoch": 1.2275422798155062, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 32.343013763427734, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6961361169815063, + "num_tokens": 289190811.0, + "step": 11178 + }, + { + "epoch": 1.22765209751812, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.567377805709839, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7133378982543945, + "num_tokens": 289216576.0, + "step": 11179 + }, + { + "epoch": 1.2277619152207335, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.429570436477661, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7134444713592529, + "num_tokens": 289241926.0, + "step": 11180 + }, + { + "epoch": 1.2278717329233473, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.7646918296813965, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7337994575500488, + "num_tokens": 289261528.0, + "step": 11181 + }, + { + "epoch": 1.2279815506259608, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.568913459777832, + "learning_rate": 1e-06, + "loss": 1.1072, + "mean_token_accuracy": 0.6861231327056885, + "num_tokens": 289284892.0, + "step": 11182 + }, + { + "epoch": 1.2280913683285746, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3390204906463623, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.7010160684585571, + "num_tokens": 289311750.0, + "step": 11183 + }, + { + 
"epoch": 1.228201186031188, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4267823696136475, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7077586054801941, + "num_tokens": 289336750.0, + "step": 11184 + }, + { + "epoch": 1.2283110037338019, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.036470890045166, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.7019883990287781, + "num_tokens": 289372893.0, + "step": 11185 + }, + { + "epoch": 1.2284208214364156, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.259748935699463, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6976161599159241, + "num_tokens": 289400788.0, + "step": 11186 + }, + { + "epoch": 1.2285306391390292, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.524649143218994, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.7094802856445312, + "num_tokens": 289428204.0, + "step": 11187 + }, + { + "epoch": 1.228640456841643, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.6912007331848145, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7319352626800537, + "num_tokens": 289448773.0, + "step": 11188 + }, + { + "epoch": 1.2287502745442564, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.563255786895752, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7155874371528625, + "num_tokens": 289472201.0, + "step": 11189 + }, + { + "epoch": 1.2288600922468702, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4823415279388428, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.732472836971283, + "num_tokens": 289493334.0, + "step": 11190 + }, + { + "epoch": 1.228969909949484, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3315041065216064, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6900341510772705, + "num_tokens": 289522313.0, + "step": 11191 + }, + { + "epoch": 1.2290797276520975, + 
"ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.1665947437286377, + "learning_rate": 1e-06, + "loss": 1.0924, + "mean_token_accuracy": 0.681381106376648, + "num_tokens": 289555995.0, + "step": 11192 + }, + { + "epoch": 1.2291895453547113, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.185805320739746, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.696003794670105, + "num_tokens": 289588907.0, + "step": 11193 + }, + { + "epoch": 1.2292993630573248, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4773924350738525, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.7084335088729858, + "num_tokens": 289613880.0, + "step": 11194 + }, + { + "epoch": 1.2294091807599385, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5697953701019287, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.715599000453949, + "num_tokens": 289635850.0, + "step": 11195 + }, + { + "epoch": 1.229518998462552, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.359604597091675, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7129886746406555, + "num_tokens": 289661786.0, + "step": 11196 + }, + { + "epoch": 1.2296288161651658, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2995951175689697, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7217843532562256, + "num_tokens": 289690312.0, + "step": 11197 + }, + { + "epoch": 1.2297386338677794, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4966933727264404, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6989256143569946, + "num_tokens": 289716247.0, + "step": 11198 + }, + { + "epoch": 1.2298484515703931, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.68485951423645, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6972388625144958, + "num_tokens": 289737054.0, + "step": 11199 + }, + { + "epoch": 1.2299582692730069, + "ewc_loss": 
1.7642974853515625e-05, + "grad_norm": 2.31581974029541, + "learning_rate": 1e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.6841887831687927, + "num_tokens": 289765541.0, + "step": 11200 + }, + { + "epoch": 1.2300680869756204, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.386930227279663, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7124463319778442, + "num_tokens": 289793721.0, + "step": 11201 + }, + { + "epoch": 1.2301779046782342, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3897716999053955, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7181333899497986, + "num_tokens": 289818886.0, + "step": 11202 + }, + { + "epoch": 1.2302877223808477, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.630223274230957, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7163479328155518, + "num_tokens": 289839342.0, + "step": 11203 + }, + { + "epoch": 1.2303975400834615, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.645630359649658, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.702052116394043, + "num_tokens": 289859828.0, + "step": 11204 + }, + { + "epoch": 1.2305073577860752, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.789140224456787, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7324120998382568, + "num_tokens": 289878027.0, + "step": 11205 + }, + { + "epoch": 1.2306171754886888, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 3.923743486404419, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6957701444625854, + "num_tokens": 289902423.0, + "step": 11206 + }, + { + "epoch": 1.2307269931913025, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4647815227508545, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6995609402656555, + "num_tokens": 289926283.0, + "step": 11207 + }, + { + "epoch": 1.230836810893916, + "ewc_loss": 1.7642974853515625e-05, + 
"grad_norm": 2.612570285797119, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7155038714408875, + "num_tokens": 289947044.0, + "step": 11208 + }, + { + "epoch": 1.2309466285965298, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.364529609680176, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6982133984565735, + "num_tokens": 289973967.0, + "step": 11209 + }, + { + "epoch": 1.2310564462991433, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5630786418914795, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7078108787536621, + "num_tokens": 289997008.0, + "step": 11210 + }, + { + "epoch": 1.231166264001757, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.395939826965332, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6898816823959351, + "num_tokens": 290025815.0, + "step": 11211 + }, + { + "epoch": 1.2312760817043706, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4660720825195312, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7022550702095032, + "num_tokens": 290049001.0, + "step": 11212 + }, + { + "epoch": 1.2313858994069844, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5086936950683594, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7091342210769653, + "num_tokens": 290072534.0, + "step": 11213 + }, + { + "epoch": 1.2314957171095982, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5095629692077637, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6923354864120483, + "num_tokens": 290098182.0, + "step": 11214 + }, + { + "epoch": 1.2316055348122117, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.480963706970215, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7186000943183899, + "num_tokens": 290121546.0, + "step": 11215 + }, + { + "epoch": 1.2317153525148254, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 
2.5342400074005127, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.732469916343689, + "num_tokens": 290144929.0, + "step": 11216 + }, + { + "epoch": 1.231825170217439, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4167063236236572, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.713038444519043, + "num_tokens": 290169041.0, + "step": 11217 + }, + { + "epoch": 1.2319349879200527, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.311937093734741, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7227485775947571, + "num_tokens": 290196061.0, + "step": 11218 + }, + { + "epoch": 1.2320448056226665, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3227734565734863, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7037817239761353, + "num_tokens": 290222130.0, + "step": 11219 + }, + { + "epoch": 1.23215462332528, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5483248233795166, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7211557030677795, + "num_tokens": 290245566.0, + "step": 11220 + }, + { + "epoch": 1.2322644410278938, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5200188159942627, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.700827956199646, + "num_tokens": 290268633.0, + "step": 11221 + }, + { + "epoch": 1.2323742587305073, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.1557655334472656, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7315926551818848, + "num_tokens": 290298052.0, + "step": 11222 + }, + { + "epoch": 1.232484076433121, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5413293838500977, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7245123386383057, + "num_tokens": 290323306.0, + "step": 11223 + }, + { + "epoch": 1.2325938941357346, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.325143575668335, + 
"learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7136520147323608, + "num_tokens": 290351222.0, + "step": 11224 + }, + { + "epoch": 1.2327037118383484, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.341409683227539, + "learning_rate": 1e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.6830775737762451, + "num_tokens": 290380297.0, + "step": 11225 + }, + { + "epoch": 1.232813529540962, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2050585746765137, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7257727384567261, + "num_tokens": 290410462.0, + "step": 11226 + }, + { + "epoch": 1.2329233472435757, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4725263118743896, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.7086703181266785, + "num_tokens": 290438048.0, + "step": 11227 + }, + { + "epoch": 1.2330331649461894, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.146686315536499, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6961250901222229, + "num_tokens": 290469821.0, + "step": 11228 + }, + { + "epoch": 1.233142982648803, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.371630907058716, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7130456566810608, + "num_tokens": 290495476.0, + "step": 11229 + }, + { + "epoch": 1.2332528003514167, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.450371742248535, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.694943904876709, + "num_tokens": 290519504.0, + "step": 11230 + }, + { + "epoch": 1.2333626180540302, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2408719062805176, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6971811056137085, + "num_tokens": 290547917.0, + "step": 11231 + }, + { + "epoch": 1.233472435756644, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3908963203430176, + "learning_rate": 1e-06, + 
"loss": 1.0301, + "mean_token_accuracy": 0.7015697360038757, + "num_tokens": 290575889.0, + "step": 11232 + }, + { + "epoch": 1.2335822534592575, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.644744396209717, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7243896722793579, + "num_tokens": 290597692.0, + "step": 11233 + }, + { + "epoch": 1.2336920711618713, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5042216777801514, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6920791268348694, + "num_tokens": 290624743.0, + "step": 11234 + }, + { + "epoch": 1.2338018888644848, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4142353534698486, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7024235129356384, + "num_tokens": 290651588.0, + "step": 11235 + }, + { + "epoch": 1.2339117065670986, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.389832019805908, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.7007386684417725, + "num_tokens": 290679796.0, + "step": 11236 + }, + { + "epoch": 1.2340215242697123, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.918696880340576, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7161839008331299, + "num_tokens": 290698190.0, + "step": 11237 + }, + { + "epoch": 1.2341313419723259, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.458115577697754, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7200583815574646, + "num_tokens": 290722165.0, + "step": 11238 + }, + { + "epoch": 1.2342411596749396, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.074732780456543, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6880266666412354, + "num_tokens": 290755251.0, + "step": 11239 + }, + { + "epoch": 1.2343509773775532, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2680468559265137, + "learning_rate": 1e-06, + "loss": 0.9918, + 
"mean_token_accuracy": 0.7094616889953613, + "num_tokens": 290782938.0, + "step": 11240 + }, + { + "epoch": 1.234460795080167, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.109360456466675, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7239130139350891, + "num_tokens": 290814105.0, + "step": 11241 + }, + { + "epoch": 1.2345706127827807, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2994072437286377, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.728277862071991, + "num_tokens": 290841887.0, + "step": 11242 + }, + { + "epoch": 1.2346804304853942, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.29709792137146, + "learning_rate": 1e-06, + "loss": 1.141, + "mean_token_accuracy": 0.6648824214935303, + "num_tokens": 290872908.0, + "step": 11243 + }, + { + "epoch": 1.234790248188008, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.233085870742798, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7067917585372925, + "num_tokens": 290902229.0, + "step": 11244 + }, + { + "epoch": 1.2349000658906215, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.237384080886841, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6879338026046753, + "num_tokens": 290930274.0, + "step": 11245 + }, + { + "epoch": 1.2350098835932353, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3939368724823, + "learning_rate": 1e-06, + "loss": 1.1009, + "mean_token_accuracy": 0.6783804893493652, + "num_tokens": 290957237.0, + "step": 11246 + }, + { + "epoch": 1.2351197012958488, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.39444899559021, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7168221473693848, + "num_tokens": 290982019.0, + "step": 11247 + }, + { + "epoch": 1.2352295189984626, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5097315311431885, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 
0.7261162400245667, + "num_tokens": 291004914.0, + "step": 11248 + }, + { + "epoch": 1.235339336701076, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2553582191467285, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7063985466957092, + "num_tokens": 291035370.0, + "step": 11249 + }, + { + "epoch": 1.2354491544036899, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3620080947875977, + "learning_rate": 1e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.6889365911483765, + "num_tokens": 291064938.0, + "step": 11250 + }, + { + "epoch": 1.2355589721063036, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.384418487548828, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.703926682472229, + "num_tokens": 291090672.0, + "step": 11251 + }, + { + "epoch": 1.2356687898089171, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5605692863464355, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7114829421043396, + "num_tokens": 291113233.0, + "step": 11252 + }, + { + "epoch": 1.235778607511531, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2699921131134033, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7121641635894775, + "num_tokens": 291142475.0, + "step": 11253 + }, + { + "epoch": 1.2358884252141444, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.312626600265503, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7079290747642517, + "num_tokens": 291169504.0, + "step": 11254 + }, + { + "epoch": 1.2359982429167582, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2295844554901123, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7055517435073853, + "num_tokens": 291202195.0, + "step": 11255 + }, + { + "epoch": 1.236108060619372, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4630770683288574, + "learning_rate": 1e-06, + "loss": 1.104, + "mean_token_accuracy": 0.6828663349151611, + 
"num_tokens": 291228469.0, + "step": 11256 + }, + { + "epoch": 1.2362178783219855, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3474175930023193, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6830663681030273, + "num_tokens": 291259022.0, + "step": 11257 + }, + { + "epoch": 1.2363276960245992, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.378941774368286, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7201653718948364, + "num_tokens": 291283767.0, + "step": 11258 + }, + { + "epoch": 1.2364375137272128, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.508110761642456, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7178477048873901, + "num_tokens": 291306139.0, + "step": 11259 + }, + { + "epoch": 1.2365473314298265, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.1750478744506836, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7078851461410522, + "num_tokens": 291336887.0, + "step": 11260 + }, + { + "epoch": 1.23665714913244, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.419823169708252, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6944184899330139, + "num_tokens": 291363600.0, + "step": 11261 + }, + { + "epoch": 1.2367669668350538, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.334601402282715, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7150486707687378, + "num_tokens": 291389067.0, + "step": 11262 + }, + { + "epoch": 1.2368767845376674, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5845203399658203, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.686286449432373, + "num_tokens": 291413585.0, + "step": 11263 + }, + { + "epoch": 1.2369866022402811, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4712326526641846, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7280545830726624, + "num_tokens": 291434867.0, + 
"step": 11264 + }, + { + "epoch": 1.2370964199428949, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3848326206207275, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7135089039802551, + "num_tokens": 291461764.0, + "step": 11265 + }, + { + "epoch": 1.2372062376455084, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.300339698791504, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7008615732192993, + "num_tokens": 291488950.0, + "step": 11266 + }, + { + "epoch": 1.2373160553481222, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.327727794647217, + "learning_rate": 1e-06, + "loss": 1.108, + "mean_token_accuracy": 0.6760643720626831, + "num_tokens": 291516738.0, + "step": 11267 + }, + { + "epoch": 1.2374258730507357, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.197911262512207, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.705377995967865, + "num_tokens": 291548428.0, + "step": 11268 + }, + { + "epoch": 1.2375356907533495, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4601283073425293, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7187808156013489, + "num_tokens": 291572552.0, + "step": 11269 + }, + { + "epoch": 1.2376455084559632, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3754642009735107, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.732433021068573, + "num_tokens": 291597888.0, + "step": 11270 + }, + { + "epoch": 1.2377553261585768, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4084079265594482, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7096289396286011, + "num_tokens": 291624458.0, + "step": 11271 + }, + { + "epoch": 1.2378651438611905, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4038140773773193, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7027690410614014, + "num_tokens": 291649258.0, + "step": 11272 + }, + { + 
"epoch": 1.237974961563804, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3937020301818848, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.729444682598114, + "num_tokens": 291673698.0, + "step": 11273 + }, + { + "epoch": 1.2380847792664178, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.475811004638672, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7152307033538818, + "num_tokens": 291697675.0, + "step": 11274 + }, + { + "epoch": 1.2381945969690313, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3109235763549805, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.7019777297973633, + "num_tokens": 291724573.0, + "step": 11275 + }, + { + "epoch": 1.238304414671645, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.8932442665100098, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7247579097747803, + "num_tokens": 291743913.0, + "step": 11276 + }, + { + "epoch": 1.2384142323742586, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3642594814300537, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7256841063499451, + "num_tokens": 291768326.0, + "step": 11277 + }, + { + "epoch": 1.2385240500768724, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4850916862487793, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6993402242660522, + "num_tokens": 291796062.0, + "step": 11278 + }, + { + "epoch": 1.2386338677794861, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2653348445892334, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7101138830184937, + "num_tokens": 291825094.0, + "step": 11279 + }, + { + "epoch": 1.2387436854820997, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3940210342407227, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.710566520690918, + "num_tokens": 291851180.0, + "step": 11280 + }, + { + "epoch": 1.2388535031847134, 
+ "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.385038137435913, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.721070408821106, + "num_tokens": 291875864.0, + "step": 11281 + }, + { + "epoch": 1.238963320887327, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3408737182617188, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7125590443611145, + "num_tokens": 291901757.0, + "step": 11282 + }, + { + "epoch": 1.2390731385899407, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.523339033126831, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7258894443511963, + "num_tokens": 291924418.0, + "step": 11283 + }, + { + "epoch": 1.2391829562925545, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.302300214767456, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7045310735702515, + "num_tokens": 291953676.0, + "step": 11284 + }, + { + "epoch": 1.239292773995168, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2799410820007324, + "learning_rate": 1e-06, + "loss": 1.0991, + "mean_token_accuracy": 0.6802667379379272, + "num_tokens": 291982067.0, + "step": 11285 + }, + { + "epoch": 1.2394025916977818, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.637411594390869, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7176605463027954, + "num_tokens": 292002505.0, + "step": 11286 + }, + { + "epoch": 1.2395124094003953, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2684643268585205, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7085041999816895, + "num_tokens": 292030540.0, + "step": 11287 + }, + { + "epoch": 1.239622227103009, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4226748943328857, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6938316822052002, + "num_tokens": 292055186.0, + "step": 11288 + }, + { + "epoch": 1.2397320448056226, + "ewc_loss": 
1.7642974853515625e-05, + "grad_norm": 2.619676113128662, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7331615686416626, + "num_tokens": 292075507.0, + "step": 11289 + }, + { + "epoch": 1.2398418625082364, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3139595985412598, + "learning_rate": 1e-06, + "loss": 1.0712, + "mean_token_accuracy": 0.6921420097351074, + "num_tokens": 292103532.0, + "step": 11290 + }, + { + "epoch": 1.23995168021085, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3013129234313965, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7249073386192322, + "num_tokens": 292131001.0, + "step": 11291 + }, + { + "epoch": 1.2400614979134637, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.21513032913208, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7103455066680908, + "num_tokens": 292159966.0, + "step": 11292 + }, + { + "epoch": 1.2401713156160774, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3381032943725586, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7205552458763123, + "num_tokens": 292185311.0, + "step": 11293 + }, + { + "epoch": 1.240281133318691, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.298967123031616, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.71891188621521, + "num_tokens": 292210456.0, + "step": 11294 + }, + { + "epoch": 1.2403909510213047, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2591915130615234, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7002049684524536, + "num_tokens": 292238954.0, + "step": 11295 + }, + { + "epoch": 1.2405007687239182, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.1650424003601074, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7440273761749268, + "num_tokens": 292268336.0, + "step": 11296 + }, + { + "epoch": 1.240610586426532, + "ewc_loss": 1.7642974853515625e-05, + 
"grad_norm": 2.523937702178955, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7262715697288513, + "num_tokens": 292290510.0, + "step": 11297 + }, + { + "epoch": 1.2407204041291455, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.258002758026123, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7247335314750671, + "num_tokens": 292319469.0, + "step": 11298 + }, + { + "epoch": 1.2408302218317593, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.459851026535034, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7046148777008057, + "num_tokens": 292345632.0, + "step": 11299 + }, + { + "epoch": 1.2409400395343728, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3214945793151855, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7019439339637756, + "num_tokens": 292372304.0, + "step": 11300 + }, + { + "epoch": 1.2410498572369866, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.406569004058838, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7426772117614746, + "num_tokens": 292395329.0, + "step": 11301 + }, + { + "epoch": 1.2411596749396003, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.425823450088501, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7142422199249268, + "num_tokens": 292420030.0, + "step": 11302 + }, + { + "epoch": 1.2412694926422139, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.374741792678833, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7039029598236084, + "num_tokens": 292444679.0, + "step": 11303 + }, + { + "epoch": 1.2413793103448276, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3195605278015137, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7132899761199951, + "num_tokens": 292471191.0, + "step": 11304 + }, + { + "epoch": 1.2414891280474412, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 
2.5754213333129883, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7128137350082397, + "num_tokens": 292493761.0, + "step": 11305 + }, + { + "epoch": 1.241598945750055, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.43930983543396, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7069019079208374, + "num_tokens": 292516844.0, + "step": 11306 + }, + { + "epoch": 1.2417087634526687, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.598928928375244, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6926007270812988, + "num_tokens": 292540030.0, + "step": 11307 + }, + { + "epoch": 1.2418185811552822, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.208359718322754, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7125331163406372, + "num_tokens": 292568982.0, + "step": 11308 + }, + { + "epoch": 1.241928398857896, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.7096920013427734, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7111437916755676, + "num_tokens": 292589148.0, + "step": 11309 + }, + { + "epoch": 1.2420382165605095, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3299524784088135, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6978508830070496, + "num_tokens": 292617921.0, + "step": 11310 + }, + { + "epoch": 1.2421480342631233, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3850629329681396, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.730927586555481, + "num_tokens": 292644301.0, + "step": 11311 + }, + { + "epoch": 1.2422578519657368, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.21419620513916, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7143856287002563, + "num_tokens": 292674415.0, + "step": 11312 + }, + { + "epoch": 1.2423676696683505, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4935195446014404, + 
"learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7124972343444824, + "num_tokens": 292698202.0, + "step": 11313 + }, + { + "epoch": 1.242477487370964, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.25251841545105, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7080919742584229, + "num_tokens": 292727895.0, + "step": 11314 + }, + { + "epoch": 1.2425873050735778, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.6264212131500244, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6897646188735962, + "num_tokens": 292751726.0, + "step": 11315 + }, + { + "epoch": 1.2426971227761916, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.358313798904419, + "learning_rate": 1e-06, + "loss": 1.1288, + "mean_token_accuracy": 0.6718087792396545, + "num_tokens": 292782025.0, + "step": 11316 + }, + { + "epoch": 1.2428069404788051, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3862829208374023, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7092169523239136, + "num_tokens": 292809852.0, + "step": 11317 + }, + { + "epoch": 1.242916758181419, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.1515355110168457, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6988106966018677, + "num_tokens": 292840054.0, + "step": 11318 + }, + { + "epoch": 1.2430265758840324, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.7924416065216064, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7384659647941589, + "num_tokens": 292858529.0, + "step": 11319 + }, + { + "epoch": 1.2431363935866462, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.211958408355713, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6992321610450745, + "num_tokens": 292888213.0, + "step": 11320 + }, + { + "epoch": 1.24324621128926, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.12314510345459, + "learning_rate": 1e-06, + "loss": 
0.9854, + "mean_token_accuracy": 0.709786057472229, + "num_tokens": 292921633.0, + "step": 11321 + }, + { + "epoch": 1.2433560289918735, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.21437406539917, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7078558802604675, + "num_tokens": 292950474.0, + "step": 11322 + }, + { + "epoch": 1.2434658466944872, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.444302558898926, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6933112740516663, + "num_tokens": 292975140.0, + "step": 11323 + }, + { + "epoch": 1.2435756643971008, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2291555404663086, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7170957922935486, + "num_tokens": 293005180.0, + "step": 11324 + }, + { + "epoch": 1.2436854820997145, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4575726985931396, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7180565595626831, + "num_tokens": 293028606.0, + "step": 11325 + }, + { + "epoch": 1.243795299802328, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.577646493911743, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7244240045547485, + "num_tokens": 293049332.0, + "step": 11326 + }, + { + "epoch": 1.2439051175049418, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.1384971141815186, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7096427083015442, + "num_tokens": 293079125.0, + "step": 11327 + }, + { + "epoch": 1.2440149352075554, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.342535972595215, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6915341019630432, + "num_tokens": 293106284.0, + "step": 11328 + }, + { + "epoch": 1.244124752910169, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.1167123317718506, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 
0.7175306081771851, + "num_tokens": 293137777.0, + "step": 11329 + }, + { + "epoch": 1.2442345706127829, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.826913356781006, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7334367036819458, + "num_tokens": 293157064.0, + "step": 11330 + }, + { + "epoch": 1.2443443883153964, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.494262218475342, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6892147064208984, + "num_tokens": 293181353.0, + "step": 11331 + }, + { + "epoch": 1.2444542060180102, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.218531370162964, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6983386278152466, + "num_tokens": 293211082.0, + "step": 11332 + }, + { + "epoch": 1.2445640237206237, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3338303565979004, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7201530933380127, + "num_tokens": 293238078.0, + "step": 11333 + }, + { + "epoch": 1.2446738414232374, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.9536805152893066, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7188656330108643, + "num_tokens": 293255608.0, + "step": 11334 + }, + { + "epoch": 1.2447836591258512, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2459089756011963, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7023658752441406, + "num_tokens": 293285103.0, + "step": 11335 + }, + { + "epoch": 1.2448934768284647, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4803457260131836, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.703027606010437, + "num_tokens": 293309605.0, + "step": 11336 + }, + { + "epoch": 1.2450032945310785, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3940744400024414, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6808963418006897, + 
"num_tokens": 293336245.0, + "step": 11337 + }, + { + "epoch": 1.245113112233692, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.458712100982666, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6968382596969604, + "num_tokens": 293361734.0, + "step": 11338 + }, + { + "epoch": 1.2452229299363058, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2118337154388428, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7174040675163269, + "num_tokens": 293389235.0, + "step": 11339 + }, + { + "epoch": 1.2453327476389193, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5895073413848877, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7089734077453613, + "num_tokens": 293410452.0, + "step": 11340 + }, + { + "epoch": 1.245442565341533, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3772454261779785, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7270247936248779, + "num_tokens": 293435361.0, + "step": 11341 + }, + { + "epoch": 1.2455523830441466, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.6083779335021973, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7294557094573975, + "num_tokens": 293457915.0, + "step": 11342 + }, + { + "epoch": 1.2456622007467604, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5895705223083496, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.738129734992981, + "num_tokens": 293478810.0, + "step": 11343 + }, + { + "epoch": 1.2457720184493741, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.462005138397217, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7245752215385437, + "num_tokens": 293502615.0, + "step": 11344 + }, + { + "epoch": 1.2458818361519877, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.452662229537964, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7146393656730652, + "num_tokens": 293527291.0, + 
"step": 11345 + }, + { + "epoch": 1.2459916538546014, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3427767753601074, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7276135683059692, + "num_tokens": 293554129.0, + "step": 11346 + }, + { + "epoch": 1.246101471557215, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3642361164093018, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.716417670249939, + "num_tokens": 293579967.0, + "step": 11347 + }, + { + "epoch": 1.2462112892598287, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2920494079589844, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7252269387245178, + "num_tokens": 293606901.0, + "step": 11348 + }, + { + "epoch": 1.2463211069624422, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3688716888427734, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6956850290298462, + "num_tokens": 293634345.0, + "step": 11349 + }, + { + "epoch": 1.246430924665056, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.36055588722229, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7033494710922241, + "num_tokens": 293660439.0, + "step": 11350 + }, + { + "epoch": 1.2465407423676698, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.40447998046875, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7275877594947815, + "num_tokens": 293685799.0, + "step": 11351 + }, + { + "epoch": 1.2466505600702833, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4271342754364014, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7232645153999329, + "num_tokens": 293710985.0, + "step": 11352 + }, + { + "epoch": 1.246760377772897, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.301262617111206, + "learning_rate": 1e-06, + "loss": 1.0981, + "mean_token_accuracy": 0.6800214648246765, + "num_tokens": 293742124.0, + "step": 11353 + }, + { + "epoch": 
1.2468701954755106, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6880178451538086, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7067918181419373, + "num_tokens": 293764646.0, + "step": 11354 + }, + { + "epoch": 1.2469800131781243, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5277137756347656, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7196333408355713, + "num_tokens": 293788042.0, + "step": 11355 + }, + { + "epoch": 1.2470898308807379, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.663259983062744, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7262675762176514, + "num_tokens": 293811110.0, + "step": 11356 + }, + { + "epoch": 1.2471996485833516, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2900390625, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6924309730529785, + "num_tokens": 293839656.0, + "step": 11357 + }, + { + "epoch": 1.2473094662859654, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.507063865661621, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6869291663169861, + "num_tokens": 293864921.0, + "step": 11358 + }, + { + "epoch": 1.247419283988579, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.5031182765960693, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.7054222226142883, + "num_tokens": 293891997.0, + "step": 11359 + }, + { + "epoch": 1.2475291016911927, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.0909533500671387, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6898409128189087, + "num_tokens": 293928415.0, + "step": 11360 + }, + { + "epoch": 1.2476389193938062, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.037062644958496, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.7045679092407227, + "num_tokens": 293964945.0, + "step": 11361 + }, + { + "epoch": 1.24774873709642, + "ewc_loss": 
1.7762184143066406e-05, + "grad_norm": 2.2948169708251953, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6996882557868958, + "num_tokens": 293991970.0, + "step": 11362 + }, + { + "epoch": 1.2478585547990335, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.3422794342041016, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7112787365913391, + "num_tokens": 294018661.0, + "step": 11363 + }, + { + "epoch": 1.2479683725016473, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2961363792419434, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7417013645172119, + "num_tokens": 294046716.0, + "step": 11364 + }, + { + "epoch": 1.2480781902042608, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3303771018981934, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7398793697357178, + "num_tokens": 294071987.0, + "step": 11365 + }, + { + "epoch": 1.2481880079068746, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4520280361175537, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7329649925231934, + "num_tokens": 294095503.0, + "step": 11366 + }, + { + "epoch": 1.2482978256094883, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.4360501766204834, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6891521215438843, + "num_tokens": 294122424.0, + "step": 11367 + }, + { + "epoch": 1.2484076433121019, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2621092796325684, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7090599536895752, + "num_tokens": 294150376.0, + "step": 11368 + }, + { + "epoch": 1.2485174610147156, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.2545018196105957, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.713178813457489, + "num_tokens": 294177393.0, + "step": 11369 + }, + { + "epoch": 1.2486272787173291, + "ewc_loss": 1.7642974853515625e-05, 
+ "grad_norm": 2.4727225303649902, + "learning_rate": 1e-06, + "loss": 1.0672, + "mean_token_accuracy": 0.7013555765151978, + "num_tokens": 294203511.0, + "step": 11370 + }, + { + "epoch": 1.248737096419943, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3284084796905518, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7072792053222656, + "num_tokens": 294232230.0, + "step": 11371 + }, + { + "epoch": 1.2488469141225567, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2604689598083496, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7349190711975098, + "num_tokens": 294261107.0, + "step": 11372 + }, + { + "epoch": 1.2489567318251702, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7047982215881348, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7204653024673462, + "num_tokens": 294283492.0, + "step": 11373 + }, + { + "epoch": 1.249066549527784, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.40520977973938, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7164222002029419, + "num_tokens": 294308887.0, + "step": 11374 + }, + { + "epoch": 1.2491763672303975, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4293205738067627, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7016617059707642, + "num_tokens": 294336422.0, + "step": 11375 + }, + { + "epoch": 1.2492861849330112, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.392937660217285, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6909602880477905, + "num_tokens": 294363697.0, + "step": 11376 + }, + { + "epoch": 1.2493960026356248, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.530280113220215, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7242047786712646, + "num_tokens": 294387983.0, + "step": 11377 + }, + { + "epoch": 1.2495058203382385, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 
2.4368393421173096, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7010948657989502, + "num_tokens": 294414661.0, + "step": 11378 + }, + { + "epoch": 1.249615638040852, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2767179012298584, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7213757634162903, + "num_tokens": 294440692.0, + "step": 11379 + }, + { + "epoch": 1.2497254557434658, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4619314670562744, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7076089978218079, + "num_tokens": 294468741.0, + "step": 11380 + }, + { + "epoch": 1.2498352734460796, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.659741163253784, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7193434238433838, + "num_tokens": 294489488.0, + "step": 11381 + }, + { + "epoch": 1.2499450911486931, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.55229115486145, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.705231249332428, + "num_tokens": 294513430.0, + "step": 11382 + }, + { + "epoch": 1.2500549088513069, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.247319221496582, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7099181413650513, + "num_tokens": 294542188.0, + "step": 11383 + }, + { + "epoch": 1.2501647265539204, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5650742053985596, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7072092890739441, + "num_tokens": 294566002.0, + "step": 11384 + }, + { + "epoch": 1.2502745442565342, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.376434326171875, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7059948444366455, + "num_tokens": 294591308.0, + "step": 11385 + }, + { + "epoch": 1.250384361959148, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.646545886993408, + 
"learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6937023997306824, + "num_tokens": 294615500.0, + "step": 11386 + }, + { + "epoch": 1.2504941796617615, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4912269115448, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7122969627380371, + "num_tokens": 294640524.0, + "step": 11387 + }, + { + "epoch": 1.2506039973643752, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.228257179260254, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.7040079832077026, + "num_tokens": 294670595.0, + "step": 11388 + }, + { + "epoch": 1.2507138150669888, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3035266399383545, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6973007321357727, + "num_tokens": 294696095.0, + "step": 11389 + }, + { + "epoch": 1.2508236327696025, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3695175647735596, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7181313037872314, + "num_tokens": 294722676.0, + "step": 11390 + }, + { + "epoch": 1.250933450472216, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.204106569290161, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7098939418792725, + "num_tokens": 294752360.0, + "step": 11391 + }, + { + "epoch": 1.2510432681748298, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3445255756378174, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.7011125087738037, + "num_tokens": 294777603.0, + "step": 11392 + }, + { + "epoch": 1.2511530858774433, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2952420711517334, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7243164777755737, + "num_tokens": 294802529.0, + "step": 11393 + }, + { + "epoch": 1.251262903580057, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.448465585708618, + "learning_rate": 1e-06, + 
"loss": 1.028, + "mean_token_accuracy": 0.692071795463562, + "num_tokens": 294827373.0, + "step": 11394 + }, + { + "epoch": 1.2513727212826709, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3179526329040527, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7126338481903076, + "num_tokens": 294855380.0, + "step": 11395 + }, + { + "epoch": 1.2514825389852844, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.458743095397949, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.741883397102356, + "num_tokens": 294877037.0, + "step": 11396 + }, + { + "epoch": 1.2515923566878981, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.54236102104187, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7056549787521362, + "num_tokens": 294901232.0, + "step": 11397 + }, + { + "epoch": 1.2517021743905117, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5477452278137207, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.723461389541626, + "num_tokens": 294921628.0, + "step": 11398 + }, + { + "epoch": 1.2518119920931254, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4331512451171875, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7207343578338623, + "num_tokens": 294947376.0, + "step": 11399 + }, + { + "epoch": 1.2519218097957392, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.342284917831421, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7231553792953491, + "num_tokens": 294976345.0, + "step": 11400 + }, + { + "epoch": 1.2520316274983527, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4709243774414062, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7143305540084839, + "num_tokens": 295000527.0, + "step": 11401 + }, + { + "epoch": 1.2521414452009663, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7277984619140625, + "learning_rate": 1e-06, + "loss": 1.0442, + 
"mean_token_accuracy": 0.6910102367401123, + "num_tokens": 295021857.0, + "step": 11402 + }, + { + "epoch": 1.25225126290358, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6243045330047607, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7398450374603271, + "num_tokens": 295045127.0, + "step": 11403 + }, + { + "epoch": 1.2523610806061938, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.436915636062622, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7298911809921265, + "num_tokens": 295070148.0, + "step": 11404 + }, + { + "epoch": 1.2524708983088073, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6900012493133545, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7087182402610779, + "num_tokens": 295089819.0, + "step": 11405 + }, + { + "epoch": 1.252580716011421, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8889660835266113, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7106676697731018, + "num_tokens": 295107843.0, + "step": 11406 + }, + { + "epoch": 1.2526905337140346, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.486055374145508, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7228416204452515, + "num_tokens": 295133757.0, + "step": 11407 + }, + { + "epoch": 1.2528003514166484, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4291555881500244, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6999006271362305, + "num_tokens": 295161640.0, + "step": 11408 + }, + { + "epoch": 1.2529101691192621, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2479825019836426, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7250310182571411, + "num_tokens": 295189968.0, + "step": 11409 + }, + { + "epoch": 1.2530199868218757, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.532984495162964, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 
0.72636479139328, + "num_tokens": 295214500.0, + "step": 11410 + }, + { + "epoch": 1.2531298045244894, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.594266176223755, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7006407976150513, + "num_tokens": 295237895.0, + "step": 11411 + }, + { + "epoch": 1.253239622227103, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5214555263519287, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7214779853820801, + "num_tokens": 295260666.0, + "step": 11412 + }, + { + "epoch": 1.2533494399297167, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.18621826171875, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6963917016983032, + "num_tokens": 295293423.0, + "step": 11413 + }, + { + "epoch": 1.2534592576323305, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4194753170013428, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7147466540336609, + "num_tokens": 295318031.0, + "step": 11414 + }, + { + "epoch": 1.253569075334944, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3579909801483154, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6823650598526001, + "num_tokens": 295345764.0, + "step": 11415 + }, + { + "epoch": 1.2536788930375575, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4428415298461914, + "learning_rate": 1e-06, + "loss": 1.0917, + "mean_token_accuracy": 0.6822061538696289, + "num_tokens": 295369222.0, + "step": 11416 + }, + { + "epoch": 1.2537887107401713, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.572971820831299, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7356385588645935, + "num_tokens": 295390231.0, + "step": 11417 + }, + { + "epoch": 1.253898528442785, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8070247173309326, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7172390222549438, + 
"num_tokens": 295411044.0, + "step": 11418 + }, + { + "epoch": 1.2540083461453986, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.9010887145996094, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7250564098358154, + "num_tokens": 295432250.0, + "step": 11419 + }, + { + "epoch": 1.2541181638480123, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5308525562286377, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7114554047584534, + "num_tokens": 295462432.0, + "step": 11420 + }, + { + "epoch": 1.2542279815506259, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3655271530151367, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6934547424316406, + "num_tokens": 295490547.0, + "step": 11421 + }, + { + "epoch": 1.2543377992532396, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4952399730682373, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7101695537567139, + "num_tokens": 295513531.0, + "step": 11422 + }, + { + "epoch": 1.2544476169558534, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.526658296585083, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7308080792427063, + "num_tokens": 295535425.0, + "step": 11423 + }, + { + "epoch": 1.254557434658467, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.418097496032715, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7171328067779541, + "num_tokens": 295561283.0, + "step": 11424 + }, + { + "epoch": 1.2546672523610807, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.205623149871826, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7142317295074463, + "num_tokens": 295589515.0, + "step": 11425 + }, + { + "epoch": 1.2547770700636942, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4665040969848633, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.7010999917984009, + "num_tokens": 295615953.0, 
+ "step": 11426 + }, + { + "epoch": 1.254886887766308, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.256258487701416, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7090706825256348, + "num_tokens": 295644770.0, + "step": 11427 + }, + { + "epoch": 1.2549967054689217, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.628160238265991, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7214376926422119, + "num_tokens": 295665904.0, + "step": 11428 + }, + { + "epoch": 1.2551065231715353, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1300485134124756, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7034158706665039, + "num_tokens": 295698262.0, + "step": 11429 + }, + { + "epoch": 1.2552163408741488, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.459775447845459, + "learning_rate": 1e-06, + "loss": 1.1011, + "mean_token_accuracy": 0.6807693243026733, + "num_tokens": 295723447.0, + "step": 11430 + }, + { + "epoch": 1.2553261585767626, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.546609401702881, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7228318452835083, + "num_tokens": 295744698.0, + "step": 11431 + }, + { + "epoch": 1.2554359762793763, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 2.7288827896118164, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7215077877044678, + "num_tokens": 295764393.0, + "step": 11432 + }, + { + "epoch": 1.2555457939819898, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.434990406036377, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7057863473892212, + "num_tokens": 295790134.0, + "step": 11433 + }, + { + "epoch": 1.2556556116846036, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6284382343292236, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7132753133773804, + "num_tokens": 295811072.0, + "step": 11434 + }, + { + 
"epoch": 1.2557654293872171, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3186609745025635, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7380236387252808, + "num_tokens": 295837017.0, + "step": 11435 + }, + { + "epoch": 1.255875247089831, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.680156707763672, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7095896005630493, + "num_tokens": 295859040.0, + "step": 11436 + }, + { + "epoch": 1.2559850647924446, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8815090656280518, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7389338612556458, + "num_tokens": 295875491.0, + "step": 11437 + }, + { + "epoch": 1.2560948824950582, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7325022220611572, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7268936038017273, + "num_tokens": 295897898.0, + "step": 11438 + }, + { + "epoch": 1.256204700197672, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.213416576385498, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6907726526260376, + "num_tokens": 295928789.0, + "step": 11439 + }, + { + "epoch": 1.2563145179002855, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4327304363250732, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.713354766368866, + "num_tokens": 295953425.0, + "step": 11440 + }, + { + "epoch": 1.2564243356028992, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2239835262298584, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.69145667552948, + "num_tokens": 295982609.0, + "step": 11441 + }, + { + "epoch": 1.2565341533055128, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.501671552658081, + "learning_rate": 1e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6836271286010742, + "num_tokens": 296008120.0, + "step": 11442 + }, + { + "epoch": 1.2566439710081265, + 
"ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.9266505241394043, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7257025837898254, + "num_tokens": 296025613.0, + "step": 11443 + }, + { + "epoch": 1.25675378871074, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2433433532714844, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7068973183631897, + "num_tokens": 296053874.0, + "step": 11444 + }, + { + "epoch": 1.2568636064133538, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8071937561035156, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7243561744689941, + "num_tokens": 296074215.0, + "step": 11445 + }, + { + "epoch": 1.2569734241159676, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.320706844329834, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6995736956596375, + "num_tokens": 296102474.0, + "step": 11446 + }, + { + "epoch": 1.257083241818581, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.286752462387085, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.688734769821167, + "num_tokens": 296131694.0, + "step": 11447 + }, + { + "epoch": 1.2571930595211949, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5213160514831543, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7310010194778442, + "num_tokens": 296153126.0, + "step": 11448 + }, + { + "epoch": 1.2573028772238084, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.544386148452759, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.693912148475647, + "num_tokens": 296179671.0, + "step": 11449 + }, + { + "epoch": 1.2574126949264222, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5534827709198, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6954085826873779, + "num_tokens": 296204581.0, + "step": 11450 + }, + { + "epoch": 1.257522512629036, + "ewc_loss": 1.7762184143066406e-05, 
+ "grad_norm": 2.819094181060791, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7111045122146606, + "num_tokens": 296224412.0, + "step": 11451 + }, + { + "epoch": 1.2576323303316495, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6902382373809814, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.707599401473999, + "num_tokens": 296246127.0, + "step": 11452 + }, + { + "epoch": 1.257742148034263, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5273807048797607, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.701921820640564, + "num_tokens": 296269949.0, + "step": 11453 + }, + { + "epoch": 1.2578519657368767, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.626277208328247, + "learning_rate": 1e-06, + "loss": 1.1056, + "mean_token_accuracy": 0.685931384563446, + "num_tokens": 296291955.0, + "step": 11454 + }, + { + "epoch": 1.2579617834394905, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3866372108459473, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7148196697235107, + "num_tokens": 296319512.0, + "step": 11455 + }, + { + "epoch": 1.258071601142104, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5111122131347656, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7044304609298706, + "num_tokens": 296344515.0, + "step": 11456 + }, + { + "epoch": 1.2581814188447178, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3845083713531494, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.7015002369880676, + "num_tokens": 296371510.0, + "step": 11457 + }, + { + "epoch": 1.2582912365473313, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.425693988800049, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7179505825042725, + "num_tokens": 296395037.0, + "step": 11458 + }, + { + "epoch": 1.258401054249945, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 
2.502748966217041, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7120672464370728, + "num_tokens": 296420556.0, + "step": 11459 + }, + { + "epoch": 1.2585108719525588, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.253988742828369, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6928290128707886, + "num_tokens": 296449227.0, + "step": 11460 + }, + { + "epoch": 1.2586206896551724, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.345160961151123, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7121185064315796, + "num_tokens": 296474535.0, + "step": 11461 + }, + { + "epoch": 1.2587305073577861, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5506534576416016, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6959432363510132, + "num_tokens": 296498255.0, + "step": 11462 + }, + { + "epoch": 1.2588403250603997, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.23199200630188, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7209398150444031, + "num_tokens": 296527472.0, + "step": 11463 + }, + { + "epoch": 1.2589501427630134, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5166738033294678, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7042224407196045, + "num_tokens": 296551741.0, + "step": 11464 + }, + { + "epoch": 1.2590599604656272, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2859995365142822, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7363024950027466, + "num_tokens": 296577777.0, + "step": 11465 + }, + { + "epoch": 1.2591697781682407, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.110805034637451, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7099868059158325, + "num_tokens": 296611448.0, + "step": 11466 + }, + { + "epoch": 1.2592795958708543, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.72090220451355, + 
"learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.7066675424575806, + "num_tokens": 296638612.0, + "step": 11467 + }, + { + "epoch": 1.259389413573468, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.380861282348633, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7123366594314575, + "num_tokens": 296663290.0, + "step": 11468 + }, + { + "epoch": 1.2594992312760818, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.418912410736084, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7246359586715698, + "num_tokens": 296688612.0, + "step": 11469 + }, + { + "epoch": 1.2596090489786953, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.399176597595215, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6849105954170227, + "num_tokens": 296718311.0, + "step": 11470 + }, + { + "epoch": 1.259718866681309, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5749802589416504, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.704658031463623, + "num_tokens": 296741384.0, + "step": 11471 + }, + { + "epoch": 1.2598286843839226, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4704275131225586, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7270011901855469, + "num_tokens": 296764291.0, + "step": 11472 + }, + { + "epoch": 1.2599385020865363, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3914620876312256, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.7055133581161499, + "num_tokens": 296790654.0, + "step": 11473 + }, + { + "epoch": 1.26004831978915, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.134230375289917, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7211666107177734, + "num_tokens": 296821205.0, + "step": 11474 + }, + { + "epoch": 1.2601581374917636, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1197147369384766, + "learning_rate": 1e-06, + 
"loss": 0.9248, + "mean_token_accuracy": 0.7231763601303101, + "num_tokens": 296849946.0, + "step": 11475 + }, + { + "epoch": 1.2602679551943774, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.293663740158081, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7073096036911011, + "num_tokens": 296879116.0, + "step": 11476 + }, + { + "epoch": 1.260377772896991, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.368528366088867, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6892985105514526, + "num_tokens": 296905714.0, + "step": 11477 + }, + { + "epoch": 1.2604875905996047, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.372225761413574, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.7050505876541138, + "num_tokens": 296932215.0, + "step": 11478 + }, + { + "epoch": 1.2605974083022184, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.199949264526367, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7561824917793274, + "num_tokens": 296959360.0, + "step": 11479 + }, + { + "epoch": 1.260707226004832, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5772409439086914, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6974616646766663, + "num_tokens": 296981937.0, + "step": 11480 + }, + { + "epoch": 1.2608170437074455, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 3.9830615520477295, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.704477071762085, + "num_tokens": 297013783.0, + "step": 11481 + }, + { + "epoch": 1.2609268614100593, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2285075187683105, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6911559104919434, + "num_tokens": 297045639.0, + "step": 11482 + }, + { + "epoch": 1.261036679112673, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.279472589492798, + "learning_rate": 1e-06, + "loss": 1.0213, + 
"mean_token_accuracy": 0.6977234482765198, + "num_tokens": 297074810.0, + "step": 11483 + }, + { + "epoch": 1.2611464968152866, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.380007266998291, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.731423556804657, + "num_tokens": 297100410.0, + "step": 11484 + }, + { + "epoch": 1.2612563145179003, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2093522548675537, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7171845436096191, + "num_tokens": 297132999.0, + "step": 11485 + }, + { + "epoch": 1.2613661322205139, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.51206636428833, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7128844857215881, + "num_tokens": 297157221.0, + "step": 11486 + }, + { + "epoch": 1.2614759499231276, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4741554260253906, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6971628069877625, + "num_tokens": 297183062.0, + "step": 11487 + }, + { + "epoch": 1.2615857676257414, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4229705333709717, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.724271297454834, + "num_tokens": 297206154.0, + "step": 11488 + }, + { + "epoch": 1.261695585328355, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5490775108337402, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7262760996818542, + "num_tokens": 297228651.0, + "step": 11489 + }, + { + "epoch": 1.2618054030309687, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2570748329162598, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7073992490768433, + "num_tokens": 297259028.0, + "step": 11490 + }, + { + "epoch": 1.2619152207335822, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3074679374694824, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 
0.7134939432144165, + "num_tokens": 297285857.0, + "step": 11491 + }, + { + "epoch": 1.262025038436196, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.547611713409424, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.687250018119812, + "num_tokens": 297312496.0, + "step": 11492 + }, + { + "epoch": 1.2621348561388097, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.386843204498291, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6929535865783691, + "num_tokens": 297338471.0, + "step": 11493 + }, + { + "epoch": 1.2622446738414232, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5443930625915527, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7176601886749268, + "num_tokens": 297363122.0, + "step": 11494 + }, + { + "epoch": 1.2623544915440368, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1807518005371094, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7032226324081421, + "num_tokens": 297393466.0, + "step": 11495 + }, + { + "epoch": 1.2624643092466505, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2882766723632812, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7125828266143799, + "num_tokens": 297422391.0, + "step": 11496 + }, + { + "epoch": 1.2625741269492643, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2998013496398926, + "learning_rate": 1e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.6881201267242432, + "num_tokens": 297451099.0, + "step": 11497 + }, + { + "epoch": 1.2626839446518778, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.46675968170166, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7000381946563721, + "num_tokens": 297476115.0, + "step": 11498 + }, + { + "epoch": 1.2627937623544916, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.651684522628784, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6947900652885437, + 
"num_tokens": 297498409.0, + "step": 11499 + }, + { + "epoch": 1.2629035800571051, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4923877716064453, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7241315841674805, + "num_tokens": 297521611.0, + "step": 11500 + }, + { + "epoch": 1.2630133977597189, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2698659896850586, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7253469228744507, + "num_tokens": 297548205.0, + "step": 11501 + }, + { + "epoch": 1.2631232154623326, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4648218154907227, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6894170641899109, + "num_tokens": 297573850.0, + "step": 11502 + }, + { + "epoch": 1.2632330331649462, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7594714164733887, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7155947685241699, + "num_tokens": 297593424.0, + "step": 11503 + }, + { + "epoch": 1.26334285086756, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4553489685058594, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6943804025650024, + "num_tokens": 297618308.0, + "step": 11504 + }, + { + "epoch": 1.2634526685701735, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5330004692077637, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.7110673189163208, + "num_tokens": 297642742.0, + "step": 11505 + }, + { + "epoch": 1.2635624862727872, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.479640245437622, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7192400097846985, + "num_tokens": 297667547.0, + "step": 11506 + }, + { + "epoch": 1.2636723039754008, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4137301445007324, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7405173182487488, + "num_tokens": 297691845.0, 
+ "step": 11507 + }, + { + "epoch": 1.2637821216780145, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.461108446121216, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7189058065414429, + "num_tokens": 297716930.0, + "step": 11508 + }, + { + "epoch": 1.263891939380628, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2314069271087646, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7010012865066528, + "num_tokens": 297747549.0, + "step": 11509 + }, + { + "epoch": 1.2640017570832418, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8783199787139893, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7251440286636353, + "num_tokens": 297766898.0, + "step": 11510 + }, + { + "epoch": 1.2641115747858556, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.485513687133789, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6976261734962463, + "num_tokens": 297791316.0, + "step": 11511 + }, + { + "epoch": 1.264221392488469, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7269747257232666, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7083495855331421, + "num_tokens": 297811952.0, + "step": 11512 + }, + { + "epoch": 1.2643312101910829, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6544225215911865, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7160463929176331, + "num_tokens": 297833174.0, + "step": 11513 + }, + { + "epoch": 1.2644410278936964, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7249388694763184, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7038053870201111, + "num_tokens": 297853684.0, + "step": 11514 + }, + { + "epoch": 1.2645508455963101, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.370382070541382, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6929214596748352, + "num_tokens": 297880080.0, + "step": 11515 + }, + { + 
"epoch": 1.264660663298924, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.305948257446289, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6940915584564209, + "num_tokens": 297909560.0, + "step": 11516 + }, + { + "epoch": 1.2647704810015374, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.474616527557373, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7082784175872803, + "num_tokens": 297932492.0, + "step": 11517 + }, + { + "epoch": 1.264880298704151, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2176101207733154, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.692826509475708, + "num_tokens": 297965319.0, + "step": 11518 + }, + { + "epoch": 1.2649901164067647, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.406674861907959, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6926258206367493, + "num_tokens": 297993903.0, + "step": 11519 + }, + { + "epoch": 1.2650999341093785, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4217560291290283, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.698055624961853, + "num_tokens": 298022800.0, + "step": 11520 + }, + { + "epoch": 1.265209751811992, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.256155490875244, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7255541086196899, + "num_tokens": 298052026.0, + "step": 11521 + }, + { + "epoch": 1.2653195695146058, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4226903915405273, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7210766077041626, + "num_tokens": 298076619.0, + "step": 11522 + }, + { + "epoch": 1.2654293872172193, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5053699016571045, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7173919677734375, + "num_tokens": 298100965.0, + "step": 11523 + }, + { + "epoch": 1.265539204919833, + 
"ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6192803382873535, + "learning_rate": 1e-06, + "loss": 1.0694, + "mean_token_accuracy": 0.6825379133224487, + "num_tokens": 298124809.0, + "step": 11524 + }, + { + "epoch": 1.2656490226224468, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.250566005706787, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6954383850097656, + "num_tokens": 298155550.0, + "step": 11525 + }, + { + "epoch": 1.2657588403250604, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.301572799682617, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.70253586769104, + "num_tokens": 298181998.0, + "step": 11526 + }, + { + "epoch": 1.2658686580276741, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.163810968399048, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7193925380706787, + "num_tokens": 298211803.0, + "step": 11527 + }, + { + "epoch": 1.2659784757302877, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.667032480239868, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.697812020778656, + "num_tokens": 298234110.0, + "step": 11528 + }, + { + "epoch": 1.2660882934329014, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.231614828109741, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6885125637054443, + "num_tokens": 298265890.0, + "step": 11529 + }, + { + "epoch": 1.2661981111355152, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4203832149505615, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7155110836029053, + "num_tokens": 298290478.0, + "step": 11530 + }, + { + "epoch": 1.2663079288381287, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.250502824783325, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7215665578842163, + "num_tokens": 298318961.0, + "step": 11531 + }, + { + "epoch": 1.2664177465407422, + "ewc_loss": 
1.7762184143066406e-05, + "grad_norm": 2.497087001800537, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7279064059257507, + "num_tokens": 298341171.0, + "step": 11532 + }, + { + "epoch": 1.266527564243356, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4190523624420166, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7207527756690979, + "num_tokens": 298365083.0, + "step": 11533 + }, + { + "epoch": 1.2666373819459698, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.117241859436035, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7029715180397034, + "num_tokens": 298396270.0, + "step": 11534 + }, + { + "epoch": 1.2667471996485833, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5712571144104004, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7032651901245117, + "num_tokens": 298420344.0, + "step": 11535 + }, + { + "epoch": 1.266857017351197, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1308796405792236, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7087098360061646, + "num_tokens": 298451386.0, + "step": 11536 + }, + { + "epoch": 1.2669668350538106, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6196773052215576, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7003819346427917, + "num_tokens": 298473767.0, + "step": 11537 + }, + { + "epoch": 1.2670766527564243, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6129214763641357, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.721318781375885, + "num_tokens": 298495330.0, + "step": 11538 + }, + { + "epoch": 1.267186470459038, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.337996006011963, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6971767544746399, + "num_tokens": 298524502.0, + "step": 11539 + }, + { + "epoch": 1.2672962881616516, + "ewc_loss": 1.7762184143066406e-05, + 
"grad_norm": 2.3628640174865723, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7069649696350098, + "num_tokens": 298552364.0, + "step": 11540 + }, + { + "epoch": 1.2674061058642654, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3501148223876953, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.707276463508606, + "num_tokens": 298578078.0, + "step": 11541 + }, + { + "epoch": 1.267515923566879, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.28300142288208, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.711516797542572, + "num_tokens": 298605107.0, + "step": 11542 + }, + { + "epoch": 1.2676257412694927, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6743719577789307, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7229846119880676, + "num_tokens": 298625913.0, + "step": 11543 + }, + { + "epoch": 1.2677355589721064, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2561020851135254, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6841880083084106, + "num_tokens": 298656895.0, + "step": 11544 + }, + { + "epoch": 1.26784537667472, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6685400009155273, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7094107866287231, + "num_tokens": 298680044.0, + "step": 11545 + }, + { + "epoch": 1.2679551943773335, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.432551145553589, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.700258731842041, + "num_tokens": 298707118.0, + "step": 11546 + }, + { + "epoch": 1.2680650120799473, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.306098461151123, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6972812414169312, + "num_tokens": 298731089.0, + "step": 11547 + }, + { + "epoch": 1.268174829782561, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.283902406692505, + 
"learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7305960655212402, + "num_tokens": 298757668.0, + "step": 11548 + }, + { + "epoch": 1.2682846474851746, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1751458644866943, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7222189903259277, + "num_tokens": 298785147.0, + "step": 11549 + }, + { + "epoch": 1.2683944651877883, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3098015785217285, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.7011715173721313, + "num_tokens": 298812858.0, + "step": 11550 + }, + { + "epoch": 1.2685042828904018, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.407562732696533, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7202205657958984, + "num_tokens": 298835848.0, + "step": 11551 + }, + { + "epoch": 1.2686141005930156, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.596846103668213, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7042375802993774, + "num_tokens": 298856536.0, + "step": 11552 + }, + { + "epoch": 1.2687239182956294, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3462822437286377, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7208580374717712, + "num_tokens": 298883334.0, + "step": 11553 + }, + { + "epoch": 1.268833735998243, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8053970336914062, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7359964847564697, + "num_tokens": 298902690.0, + "step": 11554 + }, + { + "epoch": 1.2689435537008567, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.11700177192688, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7014575004577637, + "num_tokens": 298935723.0, + "step": 11555 + }, + { + "epoch": 1.2690533714034702, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3915462493896484, + "learning_rate": 1e-06, + 
"loss": 0.9869, + "mean_token_accuracy": 0.7094305157661438, + "num_tokens": 298961129.0, + "step": 11556 + }, + { + "epoch": 1.269163189106084, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1924190521240234, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7218025922775269, + "num_tokens": 298989359.0, + "step": 11557 + }, + { + "epoch": 1.2692730068086975, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.272383451461792, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7212498784065247, + "num_tokens": 299018271.0, + "step": 11558 + }, + { + "epoch": 1.2693828245113112, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5865254402160645, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.688459575176239, + "num_tokens": 299041732.0, + "step": 11559 + }, + { + "epoch": 1.2694926422139248, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4179904460906982, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7183525562286377, + "num_tokens": 299066729.0, + "step": 11560 + }, + { + "epoch": 1.2696024599165385, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5715630054473877, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6996709108352661, + "num_tokens": 299088342.0, + "step": 11561 + }, + { + "epoch": 1.2697122776191523, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5748798847198486, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6904729604721069, + "num_tokens": 299113587.0, + "step": 11562 + }, + { + "epoch": 1.2698220953217658, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.410634756088257, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7027328610420227, + "num_tokens": 299140214.0, + "step": 11563 + }, + { + "epoch": 1.2699319130243796, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.108977794647217, + "learning_rate": 1e-06, + "loss": 1.041, + 
"mean_token_accuracy": 0.6967722177505493, + "num_tokens": 299174699.0, + "step": 11564 + }, + { + "epoch": 1.2700417307269931, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.080812692642212, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7223421335220337, + "num_tokens": 299206309.0, + "step": 11565 + }, + { + "epoch": 1.2701515484296069, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7093749046325684, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7182599306106567, + "num_tokens": 299225998.0, + "step": 11566 + }, + { + "epoch": 1.2702613661322206, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.159120798110962, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7079081535339355, + "num_tokens": 299258633.0, + "step": 11567 + }, + { + "epoch": 1.2703711838348342, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.698390245437622, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6999155282974243, + "num_tokens": 299280740.0, + "step": 11568 + }, + { + "epoch": 1.270481001537448, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.546672821044922, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7090719938278198, + "num_tokens": 299304369.0, + "step": 11569 + }, + { + "epoch": 1.2705908192400615, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3695220947265625, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6926836967468262, + "num_tokens": 299332152.0, + "step": 11570 + }, + { + "epoch": 1.2707006369426752, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3114089965820312, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6886130571365356, + "num_tokens": 299361670.0, + "step": 11571 + }, + { + "epoch": 1.2708104546452887, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3630428314208984, + "learning_rate": 1e-06, + "loss": 1.0899, + "mean_token_accuracy": 
0.6881698369979858, + "num_tokens": 299389803.0, + "step": 11572 + }, + { + "epoch": 1.2709202723479025, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1571760177612305, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7071853280067444, + "num_tokens": 299420332.0, + "step": 11573 + }, + { + "epoch": 1.271030090050516, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4984703063964844, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.690008819103241, + "num_tokens": 299444266.0, + "step": 11574 + }, + { + "epoch": 1.2711399077531298, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.172744035720825, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.704007089138031, + "num_tokens": 299476084.0, + "step": 11575 + }, + { + "epoch": 1.2712497254557436, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4103217124938965, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7121309041976929, + "num_tokens": 299501733.0, + "step": 11576 + }, + { + "epoch": 1.271359543158357, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.45023775100708, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7022596597671509, + "num_tokens": 299524569.0, + "step": 11577 + }, + { + "epoch": 1.2714693608609708, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3980140686035156, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7371920347213745, + "num_tokens": 299546818.0, + "step": 11578 + }, + { + "epoch": 1.2715791785635844, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.584892749786377, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7089889049530029, + "num_tokens": 299568630.0, + "step": 11579 + }, + { + "epoch": 1.2716889962661981, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4252865314483643, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6969106197357178, + 
"num_tokens": 299595091.0, + "step": 11580 + }, + { + "epoch": 1.271798813968812, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2789838314056396, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7112751007080078, + "num_tokens": 299624124.0, + "step": 11581 + }, + { + "epoch": 1.2719086316714254, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.437011480331421, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6925942897796631, + "num_tokens": 299649454.0, + "step": 11582 + }, + { + "epoch": 1.272018449374039, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6127238273620605, + "learning_rate": 1e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.6978317499160767, + "num_tokens": 299675046.0, + "step": 11583 + }, + { + "epoch": 1.2721282670766527, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.315394163131714, + "learning_rate": 1e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6913304924964905, + "num_tokens": 299704160.0, + "step": 11584 + }, + { + "epoch": 1.2722380847792665, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.431083917617798, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6785182952880859, + "num_tokens": 299730361.0, + "step": 11585 + }, + { + "epoch": 1.27234790248188, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3639423847198486, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7097704410552979, + "num_tokens": 299757251.0, + "step": 11586 + }, + { + "epoch": 1.2724577201844938, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6989009380340576, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7241495847702026, + "num_tokens": 299777812.0, + "step": 11587 + }, + { + "epoch": 1.2725675378871073, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.562239408493042, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7383100986480713, + "num_tokens": 299799763.0, + 
"step": 11588 + }, + { + "epoch": 1.272677355589721, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.374124526977539, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7045242786407471, + "num_tokens": 299823944.0, + "step": 11589 + }, + { + "epoch": 1.2727871732923348, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.518923282623291, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7303656339645386, + "num_tokens": 299846384.0, + "step": 11590 + }, + { + "epoch": 1.2728969909949484, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2096657752990723, + "learning_rate": 1e-06, + "loss": 1.1185, + "mean_token_accuracy": 0.6722453236579895, + "num_tokens": 299877965.0, + "step": 11591 + }, + { + "epoch": 1.273006808697562, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2637197971343994, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7218201160430908, + "num_tokens": 299907103.0, + "step": 11592 + }, + { + "epoch": 1.2731166264001756, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.204291820526123, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6981065273284912, + "num_tokens": 299939433.0, + "step": 11593 + }, + { + "epoch": 1.2732264441027894, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3206350803375244, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7359989881515503, + "num_tokens": 299965674.0, + "step": 11594 + }, + { + "epoch": 1.2733362618054032, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.583996295928955, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7018224000930786, + "num_tokens": 299988125.0, + "step": 11595 + }, + { + "epoch": 1.2734460795080167, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.687347173690796, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7316335439682007, + "num_tokens": 300007609.0, + "step": 11596 + }, + { + 
"epoch": 1.2735558972106302, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.173351287841797, + "learning_rate": 1e-06, + "loss": 1.1504, + "mean_token_accuracy": 0.667730450630188, + "num_tokens": 300042810.0, + "step": 11597 + }, + { + "epoch": 1.273665714913244, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5570085048675537, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7043803334236145, + "num_tokens": 300065000.0, + "step": 11598 + }, + { + "epoch": 1.2737755326158577, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.333240032196045, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7168480157852173, + "num_tokens": 300093292.0, + "step": 11599 + }, + { + "epoch": 1.2738853503184713, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.445876121520996, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7248101234436035, + "num_tokens": 300116894.0, + "step": 11600 + }, + { + "epoch": 1.273995168021085, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.669853925704956, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6975945234298706, + "num_tokens": 300138074.0, + "step": 11601 + }, + { + "epoch": 1.2741049857236986, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1980960369110107, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7195760011672974, + "num_tokens": 300167547.0, + "step": 11602 + }, + { + "epoch": 1.2742148034263123, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.304891347885132, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6995972394943237, + "num_tokens": 300197864.0, + "step": 11603 + }, + { + "epoch": 1.274324621128926, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.412838935852051, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6980928182601929, + "num_tokens": 300223913.0, + "step": 11604 + }, + { + "epoch": 1.2744344388315396, + 
"ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.19317364692688, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7153805494308472, + "num_tokens": 300253707.0, + "step": 11605 + }, + { + "epoch": 1.2745442565341534, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4401895999908447, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7250092029571533, + "num_tokens": 300277603.0, + "step": 11606 + }, + { + "epoch": 1.274654074236767, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.121006727218628, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7072877883911133, + "num_tokens": 300308542.0, + "step": 11607 + }, + { + "epoch": 1.2747638919393807, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.9018397331237793, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7095925807952881, + "num_tokens": 300326984.0, + "step": 11608 + }, + { + "epoch": 1.2748737096419944, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2847161293029785, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7102020382881165, + "num_tokens": 300353479.0, + "step": 11609 + }, + { + "epoch": 1.274983527344608, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4515230655670166, + "learning_rate": 1e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.6901267170906067, + "num_tokens": 300377842.0, + "step": 11610 + }, + { + "epoch": 1.2750933450472215, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7438862323760986, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.71095871925354, + "num_tokens": 300399800.0, + "step": 11611 + }, + { + "epoch": 1.2752031627498353, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7816379070281982, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6923741102218628, + "num_tokens": 300420136.0, + "step": 11612 + }, + { + "epoch": 1.275312980452449, + "ewc_loss": 
1.7762184143066406e-05, + "grad_norm": 2.511955976486206, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.727387547492981, + "num_tokens": 300443520.0, + "step": 11613 + }, + { + "epoch": 1.2754227981550625, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4711039066314697, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7233352661132812, + "num_tokens": 300468267.0, + "step": 11614 + }, + { + "epoch": 1.2755326158576763, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3497109413146973, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7046484351158142, + "num_tokens": 300498103.0, + "step": 11615 + }, + { + "epoch": 1.2756424335602898, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.436467409133911, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7240822315216064, + "num_tokens": 300521745.0, + "step": 11616 + }, + { + "epoch": 1.2757522512629036, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3434689044952393, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7185947299003601, + "num_tokens": 300550409.0, + "step": 11617 + }, + { + "epoch": 1.2758620689655173, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7345399856567383, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7189829349517822, + "num_tokens": 300573697.0, + "step": 11618 + }, + { + "epoch": 1.2759718866681309, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6169931888580322, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7130592465400696, + "num_tokens": 300595692.0, + "step": 11619 + }, + { + "epoch": 1.2760817043707446, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.405117988586426, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7128652334213257, + "num_tokens": 300621855.0, + "step": 11620 + }, + { + "epoch": 1.2761915220733582, + "ewc_loss": 1.7762184143066406e-05, + 
"grad_norm": 2.384150981903076, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7355509996414185, + "num_tokens": 300648529.0, + "step": 11621 + }, + { + "epoch": 1.276301339775972, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2301387786865234, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6991235613822937, + "num_tokens": 300680264.0, + "step": 11622 + }, + { + "epoch": 1.2764111574785855, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5994675159454346, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7055003046989441, + "num_tokens": 300703359.0, + "step": 11623 + }, + { + "epoch": 1.2765209751811992, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.9824070930480957, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7017951011657715, + "num_tokens": 300722624.0, + "step": 11624 + }, + { + "epoch": 1.2766307928838128, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3735885620117188, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7171387672424316, + "num_tokens": 300749852.0, + "step": 11625 + }, + { + "epoch": 1.2767406105864265, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5348496437072754, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7321557402610779, + "num_tokens": 300772727.0, + "step": 11626 + }, + { + "epoch": 1.2768504282890403, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6317505836486816, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7077492475509644, + "num_tokens": 300796476.0, + "step": 11627 + }, + { + "epoch": 1.2769602459916538, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.754498243331909, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7054932117462158, + "num_tokens": 300821612.0, + "step": 11628 + }, + { + "epoch": 1.2770700636942676, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 
2.3830716609954834, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7333605885505676, + "num_tokens": 300847114.0, + "step": 11629 + }, + { + "epoch": 1.277179881396881, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7084288597106934, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7081583142280579, + "num_tokens": 300867258.0, + "step": 11630 + }, + { + "epoch": 1.2772896990994949, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.267068386077881, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6811654567718506, + "num_tokens": 300895077.0, + "step": 11631 + }, + { + "epoch": 1.2773995168021086, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.236196756362915, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6982012987136841, + "num_tokens": 300926950.0, + "step": 11632 + }, + { + "epoch": 1.2775093345047221, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.268585681915283, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7149339914321899, + "num_tokens": 300954676.0, + "step": 11633 + }, + { + "epoch": 1.2776191522073357, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5677123069763184, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7128498554229736, + "num_tokens": 300979637.0, + "step": 11634 + }, + { + "epoch": 1.2777289699099494, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2683944702148438, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6985104084014893, + "num_tokens": 301008969.0, + "step": 11635 + }, + { + "epoch": 1.2778387876125632, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2595038414001465, + "learning_rate": 1e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.695426881313324, + "num_tokens": 301039515.0, + "step": 11636 + }, + { + "epoch": 1.2779486053151767, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.27638840675354, + 
"learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7170206904411316, + "num_tokens": 301066854.0, + "step": 11637 + }, + { + "epoch": 1.2780584230177905, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.185107946395874, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6957086324691772, + "num_tokens": 301097544.0, + "step": 11638 + }, + { + "epoch": 1.278168240720404, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.050382137298584, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7121027112007141, + "num_tokens": 301128404.0, + "step": 11639 + }, + { + "epoch": 1.2782780584230178, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5077710151672363, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.7061270475387573, + "num_tokens": 301154182.0, + "step": 11640 + }, + { + "epoch": 1.2783878761256315, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8145601749420166, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7192010879516602, + "num_tokens": 301174498.0, + "step": 11641 + }, + { + "epoch": 1.278497693828245, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.397854804992676, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7125364542007446, + "num_tokens": 301198593.0, + "step": 11642 + }, + { + "epoch": 1.2786075115308588, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5445992946624756, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.726577639579773, + "num_tokens": 301219877.0, + "step": 11643 + }, + { + "epoch": 1.2787173292334724, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8131470680236816, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7344018220901489, + "num_tokens": 301237174.0, + "step": 11644 + }, + { + "epoch": 1.2788271469360861, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3601553440093994, + "learning_rate": 1e-06, + 
"loss": 0.9172, + "mean_token_accuracy": 0.7165310382843018, + "num_tokens": 301264961.0, + "step": 11645 + }, + { + "epoch": 1.2789369646386999, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.416645050048828, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7136848568916321, + "num_tokens": 301288854.0, + "step": 11646 + }, + { + "epoch": 1.2790467823413134, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2935407161712646, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.714769721031189, + "num_tokens": 301317681.0, + "step": 11647 + }, + { + "epoch": 1.279156600043927, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.336954116821289, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7238733768463135, + "num_tokens": 301342976.0, + "step": 11648 + }, + { + "epoch": 1.2792664177465407, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1979196071624756, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7044063806533813, + "num_tokens": 301371968.0, + "step": 11649 + }, + { + "epoch": 1.2793762354491545, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2194690704345703, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.7026078104972839, + "num_tokens": 301399863.0, + "step": 11650 + }, + { + "epoch": 1.279486053151768, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6053121089935303, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7089324593544006, + "num_tokens": 301421566.0, + "step": 11651 + }, + { + "epoch": 1.2795958708543818, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.382747173309326, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7207546234130859, + "num_tokens": 301448732.0, + "step": 11652 + }, + { + "epoch": 1.2797056885569953, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.496154308319092, + "learning_rate": 1e-06, + "loss": 1.0431, + 
"mean_token_accuracy": 0.6879507899284363, + "num_tokens": 301472964.0, + "step": 11653 + }, + { + "epoch": 1.279815506259609, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.44352388381958, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7113430500030518, + "num_tokens": 301499186.0, + "step": 11654 + }, + { + "epoch": 1.2799253239622228, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2404985427856445, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7122044563293457, + "num_tokens": 301526014.0, + "step": 11655 + }, + { + "epoch": 1.2800351416648363, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3978428840637207, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6980600357055664, + "num_tokens": 301551649.0, + "step": 11656 + }, + { + "epoch": 1.28014495936745, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.303889751434326, + "learning_rate": 1e-06, + "loss": 1.089, + "mean_token_accuracy": 0.6866260766983032, + "num_tokens": 301580729.0, + "step": 11657 + }, + { + "epoch": 1.2802547770700636, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2213826179504395, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6907526254653931, + "num_tokens": 301610803.0, + "step": 11658 + }, + { + "epoch": 1.2803645947726774, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4919991493225098, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7177046537399292, + "num_tokens": 301634563.0, + "step": 11659 + }, + { + "epoch": 1.2804744124752911, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5628442764282227, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7238619923591614, + "num_tokens": 301655487.0, + "step": 11660 + }, + { + "epoch": 1.2805842301779047, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1069962978363037, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 
0.6968311667442322, + "num_tokens": 301692870.0, + "step": 11661 + }, + { + "epoch": 1.2806940478805182, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.468689203262329, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6961847543716431, + "num_tokens": 301718774.0, + "step": 11662 + }, + { + "epoch": 1.280803865583132, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.217078924179077, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7314661741256714, + "num_tokens": 301745583.0, + "step": 11663 + }, + { + "epoch": 1.2809136832857457, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 3.494028091430664, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7105823755264282, + "num_tokens": 301766400.0, + "step": 11664 + }, + { + "epoch": 1.2810235009883593, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5973806381225586, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7244688272476196, + "num_tokens": 301788337.0, + "step": 11665 + }, + { + "epoch": 1.281133318690973, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5544862747192383, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7019131183624268, + "num_tokens": 301812213.0, + "step": 11666 + }, + { + "epoch": 1.2812431363935866, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.569673776626587, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7334977388381958, + "num_tokens": 301834284.0, + "step": 11667 + }, + { + "epoch": 1.2813529540962003, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1361613273620605, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.7007414102554321, + "num_tokens": 301866714.0, + "step": 11668 + }, + { + "epoch": 1.281462771798814, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2596676349639893, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7120550870895386, + 
"num_tokens": 301895602.0, + "step": 11669 + }, + { + "epoch": 1.2815725895014276, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4115958213806152, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.7027098536491394, + "num_tokens": 301922347.0, + "step": 11670 + }, + { + "epoch": 1.2816824072040414, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2896459102630615, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7160006761550903, + "num_tokens": 301948200.0, + "step": 11671 + }, + { + "epoch": 1.281792224906655, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.509763479232788, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7088762521743774, + "num_tokens": 301972825.0, + "step": 11672 + }, + { + "epoch": 1.2819020426092687, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7430689334869385, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7214981317520142, + "num_tokens": 301992609.0, + "step": 11673 + }, + { + "epoch": 1.2820118603118824, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2903411388397217, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6994789838790894, + "num_tokens": 302019194.0, + "step": 11674 + }, + { + "epoch": 1.282121678014496, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.453104257583618, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6984125971794128, + "num_tokens": 302045526.0, + "step": 11675 + }, + { + "epoch": 1.2822314957171095, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4590137004852295, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.7013770341873169, + "num_tokens": 302071242.0, + "step": 11676 + }, + { + "epoch": 1.2823413134197232, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3583271503448486, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6975324153900146, + "num_tokens": 302099235.0, + 
"step": 11677 + }, + { + "epoch": 1.282451131122337, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.9804575443267822, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7088544964790344, + "num_tokens": 302125416.0, + "step": 11678 + }, + { + "epoch": 1.2825609488249505, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2265565395355225, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.707904577255249, + "num_tokens": 302155002.0, + "step": 11679 + }, + { + "epoch": 1.2826707665275643, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 7.119180679321289, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7074894905090332, + "num_tokens": 302177355.0, + "step": 11680 + }, + { + "epoch": 1.2827805842301778, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5623369216918945, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7188246250152588, + "num_tokens": 302199821.0, + "step": 11681 + }, + { + "epoch": 1.2828904019327916, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3315670490264893, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7029627561569214, + "num_tokens": 302228048.0, + "step": 11682 + }, + { + "epoch": 1.2830002196354053, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5816047191619873, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7260410785675049, + "num_tokens": 302250998.0, + "step": 11683 + }, + { + "epoch": 1.2831100373380189, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.415654420852661, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7249568700790405, + "num_tokens": 302275876.0, + "step": 11684 + }, + { + "epoch": 1.2832198550406326, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2769908905029297, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7174816131591797, + "num_tokens": 302303707.0, + "step": 11685 + }, + { + 
"epoch": 1.2833296727432462, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2598941326141357, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7348708510398865, + "num_tokens": 302329954.0, + "step": 11686 + }, + { + "epoch": 1.28343949044586, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.743028163909912, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7165946960449219, + "num_tokens": 302352435.0, + "step": 11687 + }, + { + "epoch": 1.2835493081484735, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.099576711654663, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.717133641242981, + "num_tokens": 302383080.0, + "step": 11688 + }, + { + "epoch": 1.2836591258510872, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.389620780944824, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6905279159545898, + "num_tokens": 302407828.0, + "step": 11689 + }, + { + "epoch": 1.2837689435537007, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.45424485206604, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7092879414558411, + "num_tokens": 302434324.0, + "step": 11690 + }, + { + "epoch": 1.2838787612563145, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1207852363586426, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7093074917793274, + "num_tokens": 302466784.0, + "step": 11691 + }, + { + "epoch": 1.2839885789589283, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4541971683502197, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7298780679702759, + "num_tokens": 302490422.0, + "step": 11692 + }, + { + "epoch": 1.2840983966615418, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.200381278991699, + "learning_rate": 1e-06, + "loss": 1.1077, + "mean_token_accuracy": 0.6853675842285156, + "num_tokens": 302521864.0, + "step": 11693 + }, + { + "epoch": 1.2842082143641556, + 
"ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.297959566116333, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7096325159072876, + "num_tokens": 302550059.0, + "step": 11694 + }, + { + "epoch": 1.284318032066769, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.499788522720337, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7165234088897705, + "num_tokens": 302572257.0, + "step": 11695 + }, + { + "epoch": 1.2844278497693828, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4008395671844482, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7105499505996704, + "num_tokens": 302597184.0, + "step": 11696 + }, + { + "epoch": 1.2845376674719966, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.318479061126709, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7155520915985107, + "num_tokens": 302625370.0, + "step": 11697 + }, + { + "epoch": 1.2846474851746101, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6599066257476807, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7046886682510376, + "num_tokens": 302647698.0, + "step": 11698 + }, + { + "epoch": 1.2847573028772237, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4129092693328857, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7161450386047363, + "num_tokens": 302675217.0, + "step": 11699 + }, + { + "epoch": 1.2848671205798374, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3942508697509766, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7040064334869385, + "num_tokens": 302701118.0, + "step": 11700 + }, + { + "epoch": 1.2849769382824512, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.732151985168457, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.698116660118103, + "num_tokens": 302721586.0, + "step": 11701 + }, + { + "epoch": 1.2850867559850647, + "ewc_loss": 
1.7762184143066406e-05, + "grad_norm": 2.3991055488586426, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.709973156452179, + "num_tokens": 302749143.0, + "step": 11702 + }, + { + "epoch": 1.2851965736876785, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.546617269515991, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7133231163024902, + "num_tokens": 302771892.0, + "step": 11703 + }, + { + "epoch": 1.285306391390292, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4369349479675293, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7090308666229248, + "num_tokens": 302795399.0, + "step": 11704 + }, + { + "epoch": 1.2854162090929058, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.389037847518921, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7199933528900146, + "num_tokens": 302820330.0, + "step": 11705 + }, + { + "epoch": 1.2855260267955195, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4011354446411133, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6920293569564819, + "num_tokens": 302844470.0, + "step": 11706 + }, + { + "epoch": 1.285635844498133, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2258803844451904, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6837852001190186, + "num_tokens": 302875201.0, + "step": 11707 + }, + { + "epoch": 1.2857456622007468, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4559528827667236, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7212305068969727, + "num_tokens": 302900116.0, + "step": 11708 + }, + { + "epoch": 1.2858554799033604, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3862674236297607, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6910568475723267, + "num_tokens": 302926066.0, + "step": 11709 + }, + { + "epoch": 1.2859652976059741, + "ewc_loss": 1.7762184143066406e-05, + 
"grad_norm": 2.400846004486084, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7100338935852051, + "num_tokens": 302951977.0, + "step": 11710 + }, + { + "epoch": 1.2860751153085879, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.493072032928467, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7071895599365234, + "num_tokens": 302976941.0, + "step": 11711 + }, + { + "epoch": 1.2861849330112014, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.532148838043213, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7144742012023926, + "num_tokens": 302999808.0, + "step": 11712 + }, + { + "epoch": 1.286294750713815, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1361546516418457, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.737589955329895, + "num_tokens": 303030124.0, + "step": 11713 + }, + { + "epoch": 1.2864045684164287, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1783690452575684, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7222520112991333, + "num_tokens": 303062520.0, + "step": 11714 + }, + { + "epoch": 1.2865143861190425, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.434934616088867, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7112923860549927, + "num_tokens": 303087211.0, + "step": 11715 + }, + { + "epoch": 1.286624203821656, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.336207151412964, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7152827382087708, + "num_tokens": 303114010.0, + "step": 11716 + }, + { + "epoch": 1.2867340215242697, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.944262742996216, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7283815145492554, + "num_tokens": 303132223.0, + "step": 11717 + }, + { + "epoch": 1.2868438392268833, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 6.988192081451416, + 
"learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7173991203308105, + "num_tokens": 303154590.0, + "step": 11718 + }, + { + "epoch": 1.286953656929497, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2978060245513916, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6991304159164429, + "num_tokens": 303181959.0, + "step": 11719 + }, + { + "epoch": 1.2870634746321108, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4331655502319336, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7308996915817261, + "num_tokens": 303206582.0, + "step": 11720 + }, + { + "epoch": 1.2871732923347243, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5059635639190674, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7001581192016602, + "num_tokens": 303230746.0, + "step": 11721 + }, + { + "epoch": 1.287283110037338, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2712149620056152, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7370409965515137, + "num_tokens": 303256952.0, + "step": 11722 + }, + { + "epoch": 1.2873929277399516, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.728914260864258, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7275904417037964, + "num_tokens": 303276667.0, + "step": 11723 + }, + { + "epoch": 1.2875027454425654, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2591307163238525, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7269967794418335, + "num_tokens": 303302982.0, + "step": 11724 + }, + { + "epoch": 1.2876125631451791, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4614434242248535, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.7024238109588623, + "num_tokens": 303330334.0, + "step": 11725 + }, + { + "epoch": 1.2877223808477927, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.313350200653076, + "learning_rate": 1e-06, + 
"loss": 1.0672, + "mean_token_accuracy": 0.6858933568000793, + "num_tokens": 303358251.0, + "step": 11726 + }, + { + "epoch": 1.2878321985504062, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3244991302490234, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7364352941513062, + "num_tokens": 303381982.0, + "step": 11727 + }, + { + "epoch": 1.28794201625302, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.680420398712158, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7245101928710938, + "num_tokens": 303401346.0, + "step": 11728 + }, + { + "epoch": 1.2880518339556337, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.34071683883667, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.702556848526001, + "num_tokens": 303426493.0, + "step": 11729 + }, + { + "epoch": 1.2881616516582473, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.230158805847168, + "learning_rate": 1e-06, + "loss": 1.121, + "mean_token_accuracy": 0.6751194000244141, + "num_tokens": 303458971.0, + "step": 11730 + }, + { + "epoch": 1.288271469360861, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.432691812515259, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7355812788009644, + "num_tokens": 303482654.0, + "step": 11731 + }, + { + "epoch": 1.2883812870634745, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1767773628234863, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7175785303115845, + "num_tokens": 303515731.0, + "step": 11732 + }, + { + "epoch": 1.2884911047660883, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3370521068573, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7272967100143433, + "num_tokens": 303540606.0, + "step": 11733 + }, + { + "epoch": 1.288600922468702, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5010805130004883, + "learning_rate": 1e-06, + "loss": 0.9412, + 
"mean_token_accuracy": 0.7160381078720093, + "num_tokens": 303562877.0, + "step": 11734 + }, + { + "epoch": 1.2887107401713156, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.228595018386841, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6962883472442627, + "num_tokens": 303590329.0, + "step": 11735 + }, + { + "epoch": 1.2888205578739294, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.115584135055542, + "learning_rate": 1e-06, + "loss": 1.1224, + "mean_token_accuracy": 0.6693708300590515, + "num_tokens": 303622275.0, + "step": 11736 + }, + { + "epoch": 1.2889303755765429, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1195619106292725, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6974008679389954, + "num_tokens": 303652835.0, + "step": 11737 + }, + { + "epoch": 1.2890401932791566, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4826812744140625, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7121338844299316, + "num_tokens": 303677543.0, + "step": 11738 + }, + { + "epoch": 1.2891500109817704, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.354387044906616, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7065049409866333, + "num_tokens": 303705579.0, + "step": 11739 + }, + { + "epoch": 1.289259828684384, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.458997964859009, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7240613102912903, + "num_tokens": 303727515.0, + "step": 11740 + }, + { + "epoch": 1.2893696463869975, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.210937261581421, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7103075981140137, + "num_tokens": 303757209.0, + "step": 11741 + }, + { + "epoch": 1.2894794640896112, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.298488140106201, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 
0.693375289440155, + "num_tokens": 303785195.0, + "step": 11742 + }, + { + "epoch": 1.289589281792225, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8479812145233154, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.70865797996521, + "num_tokens": 303804918.0, + "step": 11743 + }, + { + "epoch": 1.2896990994948385, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4544122219085693, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.7020265460014343, + "num_tokens": 303829718.0, + "step": 11744 + }, + { + "epoch": 1.2898089171974523, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.46577787399292, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.7021257281303406, + "num_tokens": 303858466.0, + "step": 11745 + }, + { + "epoch": 1.2899187349000658, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.504044771194458, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7382388114929199, + "num_tokens": 303881926.0, + "step": 11746 + }, + { + "epoch": 1.2900285526026796, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2694387435913086, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7214936017990112, + "num_tokens": 303908711.0, + "step": 11747 + }, + { + "epoch": 1.2901383703052933, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.32267427444458, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6998489499092102, + "num_tokens": 303934977.0, + "step": 11748 + }, + { + "epoch": 1.2902481880079069, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2045724391937256, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.715844988822937, + "num_tokens": 303962166.0, + "step": 11749 + }, + { + "epoch": 1.2903580057105206, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.221541404724121, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7049554586410522, + "num_tokens": 
303993221.0, + "step": 11750 + }, + { + "epoch": 1.2904678234131342, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6243019104003906, + "learning_rate": 1e-06, + "loss": 1.0926, + "mean_token_accuracy": 0.689720630645752, + "num_tokens": 304018005.0, + "step": 11751 + }, + { + "epoch": 1.290577641115748, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4313831329345703, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7157765030860901, + "num_tokens": 304042860.0, + "step": 11752 + }, + { + "epoch": 1.2906874588183614, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3421943187713623, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7027779221534729, + "num_tokens": 304072245.0, + "step": 11753 + }, + { + "epoch": 1.2907972765209752, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4162557125091553, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.712044358253479, + "num_tokens": 304096862.0, + "step": 11754 + }, + { + "epoch": 1.2909070942235887, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.267141580581665, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7128480076789856, + "num_tokens": 304124158.0, + "step": 11755 + }, + { + "epoch": 1.2910169119262025, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.379484176635742, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.683952271938324, + "num_tokens": 304152468.0, + "step": 11756 + }, + { + "epoch": 1.2911267296288162, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.507667303085327, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7089542150497437, + "num_tokens": 304175812.0, + "step": 11757 + }, + { + "epoch": 1.2912365473314298, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.421196222305298, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7289227247238159, + "num_tokens": 304200028.0, + "step": 11758 + 
}, + { + "epoch": 1.2913463650340435, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6022799015045166, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7238554954528809, + "num_tokens": 304224531.0, + "step": 11759 + }, + { + "epoch": 1.291456182736657, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.148326873779297, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.686539351940155, + "num_tokens": 304256347.0, + "step": 11760 + }, + { + "epoch": 1.2915660004392708, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.406498432159424, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6954178810119629, + "num_tokens": 304281763.0, + "step": 11761 + }, + { + "epoch": 1.2916758181418846, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.583158016204834, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7476135492324829, + "num_tokens": 304304235.0, + "step": 11762 + }, + { + "epoch": 1.2917856358444981, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4923348426818848, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7186790704727173, + "num_tokens": 304329706.0, + "step": 11763 + }, + { + "epoch": 1.2918954535471117, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1709721088409424, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7248413562774658, + "num_tokens": 304360223.0, + "step": 11764 + }, + { + "epoch": 1.2920052712497254, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4960310459136963, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7208935022354126, + "num_tokens": 304382198.0, + "step": 11765 + }, + { + "epoch": 1.2921150889523392, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1855483055114746, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7240747809410095, + "num_tokens": 304415819.0, + "step": 11766 + }, + { + "epoch": 
1.2922249066549527, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3198628425598145, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6998536586761475, + "num_tokens": 304443269.0, + "step": 11767 + }, + { + "epoch": 1.2923347243575665, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5302176475524902, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6926102638244629, + "num_tokens": 304467737.0, + "step": 11768 + }, + { + "epoch": 1.29244454206018, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.128415107727051, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.698621392250061, + "num_tokens": 304498891.0, + "step": 11769 + }, + { + "epoch": 1.2925543597627938, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.702188491821289, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7108557820320129, + "num_tokens": 304521902.0, + "step": 11770 + }, + { + "epoch": 1.2926641774654075, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4622766971588135, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7129385471343994, + "num_tokens": 304546070.0, + "step": 11771 + }, + { + "epoch": 1.292773995168021, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.626173973083496, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7078583240509033, + "num_tokens": 304567669.0, + "step": 11772 + }, + { + "epoch": 1.2928838128706348, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3477232456207275, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7079046964645386, + "num_tokens": 304594245.0, + "step": 11773 + }, + { + "epoch": 1.2929936305732483, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4170773029327393, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7201005816459656, + "num_tokens": 304622361.0, + "step": 11774 + }, + { + "epoch": 1.293103448275862, + 
"ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8217220306396484, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7493953704833984, + "num_tokens": 304641523.0, + "step": 11775 + }, + { + "epoch": 1.2932132659784759, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.347337007522583, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7223533391952515, + "num_tokens": 304666763.0, + "step": 11776 + }, + { + "epoch": 1.2933230836810894, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4216079711914062, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.704468846321106, + "num_tokens": 304691674.0, + "step": 11777 + }, + { + "epoch": 1.293432901383703, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5914101600646973, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7150829434394836, + "num_tokens": 304713565.0, + "step": 11778 + }, + { + "epoch": 1.2935427190863167, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3790061473846436, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7056310176849365, + "num_tokens": 304738147.0, + "step": 11779 + }, + { + "epoch": 1.2936525367889304, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.383025884628296, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6998293995857239, + "num_tokens": 304763544.0, + "step": 11780 + }, + { + "epoch": 1.293762354491544, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.213966131210327, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6925882697105408, + "num_tokens": 304793019.0, + "step": 11781 + }, + { + "epoch": 1.2938721721941577, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4913620948791504, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6982051730155945, + "num_tokens": 304821249.0, + "step": 11782 + }, + { + "epoch": 1.2939819898967713, + "ewc_loss": 
1.7762184143066406e-05, + "grad_norm": 2.244722604751587, + "learning_rate": 1e-06, + "loss": 1.0921, + "mean_token_accuracy": 0.6899856925010681, + "num_tokens": 304849658.0, + "step": 11783 + }, + { + "epoch": 1.294091807599385, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4964404106140137, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7115519046783447, + "num_tokens": 304872325.0, + "step": 11784 + }, + { + "epoch": 1.2942016253019988, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.396207094192505, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7375726699829102, + "num_tokens": 304896426.0, + "step": 11785 + }, + { + "epoch": 1.2943114430046123, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.490612506866455, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6942360401153564, + "num_tokens": 304920851.0, + "step": 11786 + }, + { + "epoch": 1.294421260707226, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.51157808303833, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7565675973892212, + "num_tokens": 304943334.0, + "step": 11787 + }, + { + "epoch": 1.2945310784098396, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.38611102104187, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6918752193450928, + "num_tokens": 304969500.0, + "step": 11788 + }, + { + "epoch": 1.2946408961124534, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.599943161010742, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.713961124420166, + "num_tokens": 304991110.0, + "step": 11789 + }, + { + "epoch": 1.2947507138150671, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5340869426727295, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7128643989562988, + "num_tokens": 305015955.0, + "step": 11790 + }, + { + "epoch": 1.2948605315176807, + "ewc_loss": 1.7762184143066406e-05, + 
"grad_norm": 2.334909200668335, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7079744338989258, + "num_tokens": 305042520.0, + "step": 11791 + }, + { + "epoch": 1.2949703492202942, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.334529399871826, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7007254362106323, + "num_tokens": 305071471.0, + "step": 11792 + }, + { + "epoch": 1.295080166922908, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.393717050552368, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7410584688186646, + "num_tokens": 305095514.0, + "step": 11793 + }, + { + "epoch": 1.2951899846255217, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5331947803497314, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6994836926460266, + "num_tokens": 305117920.0, + "step": 11794 + }, + { + "epoch": 1.2952998023281352, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6105897426605225, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7393894195556641, + "num_tokens": 305139672.0, + "step": 11795 + }, + { + "epoch": 1.295409620030749, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4231951236724854, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7190440893173218, + "num_tokens": 305164213.0, + "step": 11796 + }, + { + "epoch": 1.2955194377333625, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.492870330810547, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7025097608566284, + "num_tokens": 305188419.0, + "step": 11797 + }, + { + "epoch": 1.2956292554359763, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2642223834991455, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7012270092964172, + "num_tokens": 305217696.0, + "step": 11798 + }, + { + "epoch": 1.29573907313859, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 
2.3324708938598633, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7122931480407715, + "num_tokens": 305244888.0, + "step": 11799 + }, + { + "epoch": 1.2958488908412036, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.253140687942505, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7422980666160583, + "num_tokens": 305271407.0, + "step": 11800 + }, + { + "epoch": 1.2959587085438173, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6336777210235596, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.710182785987854, + "num_tokens": 305293901.0, + "step": 11801 + }, + { + "epoch": 1.2960685262464309, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.36238694190979, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7094515562057495, + "num_tokens": 305319982.0, + "step": 11802 + }, + { + "epoch": 1.2961783439490446, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5327231884002686, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7126199007034302, + "num_tokens": 305341512.0, + "step": 11803 + }, + { + "epoch": 1.2962881616516582, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.316805124282837, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6954679489135742, + "num_tokens": 305369414.0, + "step": 11804 + }, + { + "epoch": 1.296397979354272, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5042247772216797, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7293145656585693, + "num_tokens": 305392887.0, + "step": 11805 + }, + { + "epoch": 1.2965077970568855, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1844844818115234, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6857166290283203, + "num_tokens": 305423865.0, + "step": 11806 + }, + { + "epoch": 1.2966176147594992, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3339345455169678, + 
"learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6981987357139587, + "num_tokens": 305454469.0, + "step": 11807 + }, + { + "epoch": 1.296727432462113, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2249417304992676, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7104731798171997, + "num_tokens": 305483821.0, + "step": 11808 + }, + { + "epoch": 1.2968372501647265, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5534653663635254, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7259072065353394, + "num_tokens": 305506685.0, + "step": 11809 + }, + { + "epoch": 1.2969470678673403, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.467439889907837, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7256885766983032, + "num_tokens": 305530489.0, + "step": 11810 + }, + { + "epoch": 1.2970568855699538, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5333139896392822, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7227929830551147, + "num_tokens": 305552683.0, + "step": 11811 + }, + { + "epoch": 1.2971667032725676, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4545793533325195, + "learning_rate": 1e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7573341131210327, + "num_tokens": 305575005.0, + "step": 11812 + }, + { + "epoch": 1.2972765209751813, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.407670497894287, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.726443350315094, + "num_tokens": 305601169.0, + "step": 11813 + }, + { + "epoch": 1.2973863386777948, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.38136887550354, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7244182825088501, + "num_tokens": 305628213.0, + "step": 11814 + }, + { + "epoch": 1.2974961563804086, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.311776876449585, + "learning_rate": 1e-06, + 
"loss": 0.9867, + "mean_token_accuracy": 0.7027774453163147, + "num_tokens": 305655998.0, + "step": 11815 + }, + { + "epoch": 1.2976059740830221, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.332953929901123, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7005563974380493, + "num_tokens": 305682178.0, + "step": 11816 + }, + { + "epoch": 1.297715791785636, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4963488578796387, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.7000417709350586, + "num_tokens": 305706363.0, + "step": 11817 + }, + { + "epoch": 1.2978256094882494, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1357834339141846, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7191904187202454, + "num_tokens": 305736478.0, + "step": 11818 + }, + { + "epoch": 1.2979354271908632, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3225488662719727, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7045235633850098, + "num_tokens": 305765312.0, + "step": 11819 + }, + { + "epoch": 1.2980452448934767, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.493638277053833, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7316272258758545, + "num_tokens": 305788731.0, + "step": 11820 + }, + { + "epoch": 1.2981550625960905, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.380220651626587, + "learning_rate": 1e-06, + "loss": 1.0923, + "mean_token_accuracy": 0.6871780753135681, + "num_tokens": 305814487.0, + "step": 11821 + }, + { + "epoch": 1.2982648802987042, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.336216688156128, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7296788096427917, + "num_tokens": 305840110.0, + "step": 11822 + }, + { + "epoch": 1.2983746980013178, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1200551986694336, + "learning_rate": 1e-06, + "loss": 0.9555, + 
"mean_token_accuracy": 0.7143687009811401, + "num_tokens": 305870635.0, + "step": 11823 + }, + { + "epoch": 1.2984845157039315, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2335243225097656, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6980304718017578, + "num_tokens": 305900461.0, + "step": 11824 + }, + { + "epoch": 1.298594333406545, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.601188898086548, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7148223519325256, + "num_tokens": 305920842.0, + "step": 11825 + }, + { + "epoch": 1.2987041511091588, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3762619495391846, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6970034837722778, + "num_tokens": 305947118.0, + "step": 11826 + }, + { + "epoch": 1.2988139688117726, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2771010398864746, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7152646780014038, + "num_tokens": 305973592.0, + "step": 11827 + }, + { + "epoch": 1.2989237865143861, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.504302501678467, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7108124494552612, + "num_tokens": 306000339.0, + "step": 11828 + }, + { + "epoch": 1.2990336042169996, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 3.0935730934143066, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7037680149078369, + "num_tokens": 306018889.0, + "step": 11829 + }, + { + "epoch": 1.2991434219196134, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6262218952178955, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7145389914512634, + "num_tokens": 306041685.0, + "step": 11830 + }, + { + "epoch": 1.2992532396222272, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.256854772567749, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 
0.7029037475585938, + "num_tokens": 306072866.0, + "step": 11831 + }, + { + "epoch": 1.2993630573248407, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.410499095916748, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7075645327568054, + "num_tokens": 306100711.0, + "step": 11832 + }, + { + "epoch": 1.2994728750274545, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.662479877471924, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6924588084220886, + "num_tokens": 306122537.0, + "step": 11833 + }, + { + "epoch": 1.299582692730068, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4911270141601562, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7135688066482544, + "num_tokens": 306145470.0, + "step": 11834 + }, + { + "epoch": 1.2996925104326817, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3839030265808105, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7012831568717957, + "num_tokens": 306171805.0, + "step": 11835 + }, + { + "epoch": 1.2998023281352955, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 3.359151840209961, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7230808734893799, + "num_tokens": 306196865.0, + "step": 11836 + }, + { + "epoch": 1.299912145837909, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3272578716278076, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7131116390228271, + "num_tokens": 306224739.0, + "step": 11837 + }, + { + "epoch": 1.3000219635405228, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.492327928543091, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6961326599121094, + "num_tokens": 306248787.0, + "step": 11838 + }, + { + "epoch": 1.3001317812431363, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3290297985076904, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7048542499542236, + 
"num_tokens": 306274373.0, + "step": 11839 + }, + { + "epoch": 1.30024159894575, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3195199966430664, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7103874683380127, + "num_tokens": 306299504.0, + "step": 11840 + }, + { + "epoch": 1.3003514166483638, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.194087028503418, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.7002683877944946, + "num_tokens": 306328299.0, + "step": 11841 + }, + { + "epoch": 1.3004612343509774, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4889252185821533, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7280857563018799, + "num_tokens": 306351122.0, + "step": 11842 + }, + { + "epoch": 1.300571052053591, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6160826683044434, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7252063751220703, + "num_tokens": 306371039.0, + "step": 11843 + }, + { + "epoch": 1.3006808697562047, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2949185371398926, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7061541080474854, + "num_tokens": 306398171.0, + "step": 11844 + }, + { + "epoch": 1.3007906874588184, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2209296226501465, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.708876371383667, + "num_tokens": 306426726.0, + "step": 11845 + }, + { + "epoch": 1.300900505161432, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.232438325881958, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6891094446182251, + "num_tokens": 306459056.0, + "step": 11846 + }, + { + "epoch": 1.3010103228640457, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1865406036376953, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6978312730789185, + "num_tokens": 306488203.0, + 
"step": 11847 + }, + { + "epoch": 1.3011201405666593, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.35365629196167, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7305643558502197, + "num_tokens": 306514586.0, + "step": 11848 + }, + { + "epoch": 1.301229958269273, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.492525577545166, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7262389659881592, + "num_tokens": 306541307.0, + "step": 11849 + }, + { + "epoch": 1.3013397759718868, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.299988269805908, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.720039427280426, + "num_tokens": 306568190.0, + "step": 11850 + }, + { + "epoch": 1.3014495936745003, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.187798500061035, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7143322229385376, + "num_tokens": 306598005.0, + "step": 11851 + }, + { + "epoch": 1.301559411377114, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.266986608505249, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7163946628570557, + "num_tokens": 306626348.0, + "step": 11852 + }, + { + "epoch": 1.3016692290797276, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6792190074920654, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7062767744064331, + "num_tokens": 306648604.0, + "step": 11853 + }, + { + "epoch": 1.3017790467823414, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2613611221313477, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6993488669395447, + "num_tokens": 306677033.0, + "step": 11854 + }, + { + "epoch": 1.301888864484955, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3539063930511475, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7178839445114136, + "num_tokens": 306704591.0, + "step": 11855 + }, + { + "epoch": 
1.3019986821875686, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.776764392852783, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7219969034194946, + "num_tokens": 306722762.0, + "step": 11856 + }, + { + "epoch": 1.3021084998901822, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1501381397247314, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6964842081069946, + "num_tokens": 306755451.0, + "step": 11857 + }, + { + "epoch": 1.302218317592796, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5601630210876465, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7385139465332031, + "num_tokens": 306777213.0, + "step": 11858 + }, + { + "epoch": 1.3023281352954097, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.136746883392334, + "learning_rate": 1e-06, + "loss": 1.1033, + "mean_token_accuracy": 0.6826602816581726, + "num_tokens": 306811483.0, + "step": 11859 + }, + { + "epoch": 1.3024379529980232, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4505865573883057, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.7097219824790955, + "num_tokens": 306835767.0, + "step": 11860 + }, + { + "epoch": 1.302547770700637, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.482929229736328, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7258086800575256, + "num_tokens": 306858057.0, + "step": 11861 + }, + { + "epoch": 1.3026575884032505, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.693594217300415, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7286471128463745, + "num_tokens": 306881002.0, + "step": 11862 + }, + { + "epoch": 1.3027674061058643, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.436629295349121, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7040678262710571, + "num_tokens": 306907122.0, + "step": 11863 + }, + { + "epoch": 1.302877223808478, + 
"ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4124135971069336, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7143335342407227, + "num_tokens": 306931095.0, + "step": 11864 + }, + { + "epoch": 1.3029870415110916, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4941511154174805, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7204316258430481, + "num_tokens": 306953333.0, + "step": 11865 + }, + { + "epoch": 1.3030968592137053, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3734772205352783, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7045238018035889, + "num_tokens": 306979310.0, + "step": 11866 + }, + { + "epoch": 1.3032066769163189, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4033470153808594, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7164030075073242, + "num_tokens": 307005402.0, + "step": 11867 + }, + { + "epoch": 1.3033164946189326, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4511373043060303, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7113828063011169, + "num_tokens": 307031279.0, + "step": 11868 + }, + { + "epoch": 1.3034263123215462, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6007795333862305, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7182366847991943, + "num_tokens": 307054553.0, + "step": 11869 + }, + { + "epoch": 1.30353613002416, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3854820728302, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6997498869895935, + "num_tokens": 307081607.0, + "step": 11870 + }, + { + "epoch": 1.3036459477267734, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3522262573242188, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.711137056350708, + "num_tokens": 307108952.0, + "step": 11871 + }, + { + "epoch": 1.3037557654293872, + "ewc_loss": 
1.7762184143066406e-05, + "grad_norm": 2.7661685943603516, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7008346319198608, + "num_tokens": 307129054.0, + "step": 11872 + }, + { + "epoch": 1.303865583132001, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2072811126708984, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7355282306671143, + "num_tokens": 307155838.0, + "step": 11873 + }, + { + "epoch": 1.3039754008346145, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.362321615219116, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.696037232875824, + "num_tokens": 307182441.0, + "step": 11874 + }, + { + "epoch": 1.3040852185372283, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.758486747741699, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7190066576004028, + "num_tokens": 307204790.0, + "step": 11875 + }, + { + "epoch": 1.3041950362398418, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4036779403686523, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7028133869171143, + "num_tokens": 307230438.0, + "step": 11876 + }, + { + "epoch": 1.3043048539424555, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2495064735412598, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7001147866249084, + "num_tokens": 307262556.0, + "step": 11877 + }, + { + "epoch": 1.3044146716450693, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6935880184173584, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6957554221153259, + "num_tokens": 307283816.0, + "step": 11878 + }, + { + "epoch": 1.3045244893476828, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7923338413238525, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7033324241638184, + "num_tokens": 307304357.0, + "step": 11879 + }, + { + "epoch": 1.3046343070502964, + "ewc_loss": 1.7762184143066406e-05, + 
"grad_norm": 2.408440589904785, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7047445774078369, + "num_tokens": 307329642.0, + "step": 11880 + }, + { + "epoch": 1.3047441247529101, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2846429347991943, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7177987694740295, + "num_tokens": 307357631.0, + "step": 11881 + }, + { + "epoch": 1.3048539424555239, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2944681644439697, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7078820466995239, + "num_tokens": 307384204.0, + "step": 11882 + }, + { + "epoch": 1.3049637601581374, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.323511838912964, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6932479739189148, + "num_tokens": 307411603.0, + "step": 11883 + }, + { + "epoch": 1.3050735778607512, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.297919273376465, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6777377128601074, + "num_tokens": 307440959.0, + "step": 11884 + }, + { + "epoch": 1.3051833955633647, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1676406860351562, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6906615495681763, + "num_tokens": 307472194.0, + "step": 11885 + }, + { + "epoch": 1.3052932132659785, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1815919876098633, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7120232582092285, + "num_tokens": 307502651.0, + "step": 11886 + }, + { + "epoch": 1.3054030309685922, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.595179319381714, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7197731733322144, + "num_tokens": 307525708.0, + "step": 11887 + }, + { + "epoch": 1.3055128486712058, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 
2.4601664543151855, + "learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6940121650695801, + "num_tokens": 307551774.0, + "step": 11888 + }, + { + "epoch": 1.3056226663738195, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2758638858795166, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7196574211120605, + "num_tokens": 307577389.0, + "step": 11889 + }, + { + "epoch": 1.305732484076433, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2636632919311523, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7035578489303589, + "num_tokens": 307605984.0, + "step": 11890 + }, + { + "epoch": 1.3058423017790468, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.651545763015747, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7152422666549683, + "num_tokens": 307627950.0, + "step": 11891 + }, + { + "epoch": 1.3059521194816606, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4042420387268066, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7082371711730957, + "num_tokens": 307653276.0, + "step": 11892 + }, + { + "epoch": 1.306061937184274, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.227285385131836, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6978063583374023, + "num_tokens": 307681643.0, + "step": 11893 + }, + { + "epoch": 1.3061717548868876, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2845349311828613, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7239881157875061, + "num_tokens": 307707993.0, + "step": 11894 + }, + { + "epoch": 1.3062815725895014, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.53251314163208, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7030006647109985, + "num_tokens": 307730284.0, + "step": 11895 + }, + { + "epoch": 1.3063913902921152, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7343435287475586, + 
"learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7269526720046997, + "num_tokens": 307750876.0, + "step": 11896 + }, + { + "epoch": 1.3065012079947287, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.628488779067993, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7304575443267822, + "num_tokens": 307771352.0, + "step": 11897 + }, + { + "epoch": 1.3066110256973424, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.49169659614563, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6895772814750671, + "num_tokens": 307799490.0, + "step": 11898 + }, + { + "epoch": 1.306720843399956, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6285948753356934, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7337407469749451, + "num_tokens": 307822092.0, + "step": 11899 + }, + { + "epoch": 1.3068306611025697, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3501462936401367, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7110673189163208, + "num_tokens": 307847578.0, + "step": 11900 + }, + { + "epoch": 1.3069404788051835, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.148371934890747, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7019338607788086, + "num_tokens": 307879184.0, + "step": 11901 + }, + { + "epoch": 1.307050296507797, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5819342136383057, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6980640888214111, + "num_tokens": 307902946.0, + "step": 11902 + }, + { + "epoch": 1.3071601142104108, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.677189588546753, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6925204992294312, + "num_tokens": 307925745.0, + "step": 11903 + }, + { + "epoch": 1.3072699319130243, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5251412391662598, + "learning_rate": 1e-06, + 
"loss": 1.0486, + "mean_token_accuracy": 0.6960111856460571, + "num_tokens": 307950738.0, + "step": 11904 + }, + { + "epoch": 1.307379749615638, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1944613456726074, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7002268433570862, + "num_tokens": 307980167.0, + "step": 11905 + }, + { + "epoch": 1.3074895673182518, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.601982593536377, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7124691605567932, + "num_tokens": 308003106.0, + "step": 11906 + }, + { + "epoch": 1.3075993850208654, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2727036476135254, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7158790826797485, + "num_tokens": 308029924.0, + "step": 11907 + }, + { + "epoch": 1.307709202723479, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5058014392852783, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6977880597114563, + "num_tokens": 308053551.0, + "step": 11908 + }, + { + "epoch": 1.3078190204260927, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.624929666519165, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7176271677017212, + "num_tokens": 308075839.0, + "step": 11909 + }, + { + "epoch": 1.3079288381287064, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.589752197265625, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7349926233291626, + "num_tokens": 308098003.0, + "step": 11910 + }, + { + "epoch": 1.30803865583132, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.16387677192688, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6992488503456116, + "num_tokens": 308131484.0, + "step": 11911 + }, + { + "epoch": 1.3081484735339337, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.547189235687256, + "learning_rate": 1e-06, + "loss": 0.9529, + 
"mean_token_accuracy": 0.7143599987030029, + "num_tokens": 308152707.0, + "step": 11912 + }, + { + "epoch": 1.3082582912365472, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.285733461380005, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7101256847381592, + "num_tokens": 308183132.0, + "step": 11913 + }, + { + "epoch": 1.308368108939161, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8118057250976562, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7386913895606995, + "num_tokens": 308201717.0, + "step": 11914 + }, + { + "epoch": 1.3084779266417748, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.8742849826812744, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7140437364578247, + "num_tokens": 308221777.0, + "step": 11915 + }, + { + "epoch": 1.3085877443443883, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2229113578796387, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7038134336471558, + "num_tokens": 308252221.0, + "step": 11916 + }, + { + "epoch": 1.308697562047002, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5715622901916504, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7193589210510254, + "num_tokens": 308274420.0, + "step": 11917 + }, + { + "epoch": 1.3088073797496156, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.041562795639038, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6843135952949524, + "num_tokens": 308309469.0, + "step": 11918 + }, + { + "epoch": 1.3089171974522293, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 3.942023992538452, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7275997400283813, + "num_tokens": 308334493.0, + "step": 11919 + }, + { + "epoch": 1.309027015154843, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3299920558929443, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 
0.7048977613449097, + "num_tokens": 308362169.0, + "step": 11920 + }, + { + "epoch": 1.3091368328574566, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.628527879714966, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7236025333404541, + "num_tokens": 308383792.0, + "step": 11921 + }, + { + "epoch": 1.3092466505600702, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.534412145614624, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7125999927520752, + "num_tokens": 308408941.0, + "step": 11922 + }, + { + "epoch": 1.309356468262684, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3941307067871094, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6941919922828674, + "num_tokens": 308437407.0, + "step": 11923 + }, + { + "epoch": 1.3094662859652977, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3518424034118652, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6992462277412415, + "num_tokens": 308464422.0, + "step": 11924 + }, + { + "epoch": 1.3095761036679112, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.301623821258545, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6872389316558838, + "num_tokens": 308493901.0, + "step": 11925 + }, + { + "epoch": 1.309685921370525, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.104496955871582, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7149004936218262, + "num_tokens": 308527056.0, + "step": 11926 + }, + { + "epoch": 1.3097957390731385, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1898725032806396, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7086461782455444, + "num_tokens": 308559454.0, + "step": 11927 + }, + { + "epoch": 1.3099055567757523, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4857730865478516, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7154460549354553, + 
"num_tokens": 308582588.0, + "step": 11928 + }, + { + "epoch": 1.310015374478366, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.394716501235962, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.704863429069519, + "num_tokens": 308607198.0, + "step": 11929 + }, + { + "epoch": 1.3101251921809796, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3517603874206543, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.705585241317749, + "num_tokens": 308633119.0, + "step": 11930 + }, + { + "epoch": 1.3102350098835933, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3688387870788574, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7034145593643188, + "num_tokens": 308658039.0, + "step": 11931 + }, + { + "epoch": 1.3103448275862069, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.493406295776367, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7340269088745117, + "num_tokens": 308678716.0, + "step": 11932 + }, + { + "epoch": 1.3104546452888206, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.213257312774658, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7048182487487793, + "num_tokens": 308708883.0, + "step": 11933 + }, + { + "epoch": 1.3105644629914341, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3518433570861816, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7190173864364624, + "num_tokens": 308733671.0, + "step": 11934 + }, + { + "epoch": 1.310674280694048, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3248801231384277, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.7027515172958374, + "num_tokens": 308760280.0, + "step": 11935 + }, + { + "epoch": 1.3107840983966614, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6007580757141113, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7259984016418457, + "num_tokens": 308780609.0, + 
"step": 11936 + }, + { + "epoch": 1.3108939160992752, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.414752960205078, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6952832937240601, + "num_tokens": 308804795.0, + "step": 11937 + }, + { + "epoch": 1.311003733801889, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7042362689971924, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7191580533981323, + "num_tokens": 308827372.0, + "step": 11938 + }, + { + "epoch": 1.3111135515045025, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.469045877456665, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7210030555725098, + "num_tokens": 308851157.0, + "step": 11939 + }, + { + "epoch": 1.3112233692071162, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4932444095611572, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7122907638549805, + "num_tokens": 308876515.0, + "step": 11940 + }, + { + "epoch": 1.3113331869097298, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.679370164871216, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.694451093673706, + "num_tokens": 308898561.0, + "step": 11941 + }, + { + "epoch": 1.3114430046123435, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1648313999176025, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7079595327377319, + "num_tokens": 308928792.0, + "step": 11942 + }, + { + "epoch": 1.3115528223149573, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.25300669670105, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7287989854812622, + "num_tokens": 308955868.0, + "step": 11943 + }, + { + "epoch": 1.3116626400175708, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4600887298583984, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7095786333084106, + "num_tokens": 308979800.0, + "step": 11944 + }, + { + 
"epoch": 1.3117724577201844, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.276059150695801, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7017918825149536, + "num_tokens": 309009514.0, + "step": 11945 + }, + { + "epoch": 1.3118822754227981, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7056796550750732, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7119985818862915, + "num_tokens": 309030237.0, + "step": 11946 + }, + { + "epoch": 1.3119920931254119, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.298647880554199, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7224102020263672, + "num_tokens": 309058352.0, + "step": 11947 + }, + { + "epoch": 1.3121019108280254, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.764833688735962, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.711202085018158, + "num_tokens": 309078411.0, + "step": 11948 + }, + { + "epoch": 1.3122117285306392, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5463101863861084, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.729606032371521, + "num_tokens": 309099362.0, + "step": 11949 + }, + { + "epoch": 1.3123215462332527, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7295548915863037, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7294889688491821, + "num_tokens": 309120392.0, + "step": 11950 + }, + { + "epoch": 1.3124313639358665, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.213087797164917, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7190901041030884, + "num_tokens": 309149883.0, + "step": 11951 + }, + { + "epoch": 1.3125411816384802, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.380133867263794, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7069059610366821, + "num_tokens": 309176404.0, + "step": 11952 + }, + { + "epoch": 1.3126509993410937, + 
"ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.114278793334961, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6951628923416138, + "num_tokens": 309204974.0, + "step": 11953 + }, + { + "epoch": 1.3127608170437075, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.250359535217285, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.713154673576355, + "num_tokens": 309233538.0, + "step": 11954 + }, + { + "epoch": 1.312870634746321, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1358931064605713, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.7000741958618164, + "num_tokens": 309265874.0, + "step": 11955 + }, + { + "epoch": 1.3129804524489348, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.4332516193389893, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6855484247207642, + "num_tokens": 309292134.0, + "step": 11956 + }, + { + "epoch": 1.3130902701515486, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5423052310943604, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7322921752929688, + "num_tokens": 309314202.0, + "step": 11957 + }, + { + "epoch": 1.313200087854162, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.1507887840270996, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.711315929889679, + "num_tokens": 309345545.0, + "step": 11958 + }, + { + "epoch": 1.3133099055567756, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.737401247024536, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6976025104522705, + "num_tokens": 309368933.0, + "step": 11959 + }, + { + "epoch": 1.3134197232593894, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 3.777547597885132, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7097185850143433, + "num_tokens": 309395945.0, + "step": 11960 + }, + { + "epoch": 1.3135295409620031, + "ewc_loss": 
1.7762184143066406e-05, + "grad_norm": 2.470630407333374, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6964272260665894, + "num_tokens": 309421344.0, + "step": 11961 + }, + { + "epoch": 1.3136393586646167, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.640629529953003, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7069798707962036, + "num_tokens": 309444483.0, + "step": 11962 + }, + { + "epoch": 1.3137491763672304, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5186080932617188, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7123637199401855, + "num_tokens": 309468167.0, + "step": 11963 + }, + { + "epoch": 1.313858994069844, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1624655723571777, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6976286172866821, + "num_tokens": 309503149.0, + "step": 11964 + }, + { + "epoch": 1.3139688117724577, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3941118717193604, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7278265953063965, + "num_tokens": 309528160.0, + "step": 11965 + }, + { + "epoch": 1.3140786294750715, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2308552265167236, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7190338373184204, + "num_tokens": 309556572.0, + "step": 11966 + }, + { + "epoch": 1.314188447177685, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4139912128448486, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7163281440734863, + "num_tokens": 309582611.0, + "step": 11967 + }, + { + "epoch": 1.3142982648802988, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3235504627227783, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7312542200088501, + "num_tokens": 309609761.0, + "step": 11968 + }, + { + "epoch": 1.3144080825829123, + "ewc_loss": 1.7762184143066406e-05, + 
"grad_norm": 2.3340203762054443, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7091069221496582, + "num_tokens": 309635992.0, + "step": 11969 + }, + { + "epoch": 1.314517900285526, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6102283000946045, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7095245122909546, + "num_tokens": 309659002.0, + "step": 11970 + }, + { + "epoch": 1.3146277179881398, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.343630075454712, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.700453519821167, + "num_tokens": 309689015.0, + "step": 11971 + }, + { + "epoch": 1.3147375356907534, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4008140563964844, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.686073899269104, + "num_tokens": 309715038.0, + "step": 11972 + }, + { + "epoch": 1.314847353393367, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5801892280578613, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7038336992263794, + "num_tokens": 309739071.0, + "step": 11973 + }, + { + "epoch": 1.3149571710959806, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.34059739112854, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6846271753311157, + "num_tokens": 309769186.0, + "step": 11974 + }, + { + "epoch": 1.3150669887985944, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.243685483932495, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7104376554489136, + "num_tokens": 309800157.0, + "step": 11975 + }, + { + "epoch": 1.315176806501208, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3652215003967285, + "learning_rate": 1e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6868132948875427, + "num_tokens": 309827522.0, + "step": 11976 + }, + { + "epoch": 1.3152866242038217, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2135770320892334, 
+ "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6869587898254395, + "num_tokens": 309855405.0, + "step": 11977 + }, + { + "epoch": 1.3153964419064352, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1549646854400635, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7092424631118774, + "num_tokens": 309885220.0, + "step": 11978 + }, + { + "epoch": 1.315506259609049, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.481694459915161, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7197113037109375, + "num_tokens": 309912524.0, + "step": 11979 + }, + { + "epoch": 1.3156160773116627, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2021143436431885, + "learning_rate": 1e-06, + "loss": 1.1013, + "mean_token_accuracy": 0.6804171800613403, + "num_tokens": 309944880.0, + "step": 11980 + }, + { + "epoch": 1.3157258950142763, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.570059061050415, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6990566253662109, + "num_tokens": 309971327.0, + "step": 11981 + }, + { + "epoch": 1.31583571271689, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6239635944366455, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6961995363235474, + "num_tokens": 309996491.0, + "step": 11982 + }, + { + "epoch": 1.3159455304195036, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.7370786666870117, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7293376922607422, + "num_tokens": 310016335.0, + "step": 11983 + }, + { + "epoch": 1.3160553481221173, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.082279920578003, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6978345513343811, + "num_tokens": 310051258.0, + "step": 11984 + }, + { + "epoch": 1.3161651658247309, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2295033931732178, + "learning_rate": 1e-06, + 
"loss": 0.9957, + "mean_token_accuracy": 0.7029799818992615, + "num_tokens": 310084879.0, + "step": 11985 + }, + { + "epoch": 1.3162749835273446, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.423349142074585, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7317774295806885, + "num_tokens": 310110181.0, + "step": 11986 + }, + { + "epoch": 1.3163848012299582, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.439772844314575, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7402432560920715, + "num_tokens": 310132279.0, + "step": 11987 + }, + { + "epoch": 1.316494618932572, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.479982852935791, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7257939577102661, + "num_tokens": 310154905.0, + "step": 11988 + }, + { + "epoch": 1.3166044366351857, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3739469051361084, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7126693725585938, + "num_tokens": 310182554.0, + "step": 11989 + }, + { + "epoch": 1.3167142543377992, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1115200519561768, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7124592661857605, + "num_tokens": 310216138.0, + "step": 11990 + }, + { + "epoch": 1.316824072040413, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3290369510650635, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7107441425323486, + "num_tokens": 310243173.0, + "step": 11991 + }, + { + "epoch": 1.3169338897430265, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.514962911605835, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7131224274635315, + "num_tokens": 310266769.0, + "step": 11992 + }, + { + "epoch": 1.3170437074456403, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1687445640563965, + "learning_rate": 1e-06, + "loss": 1.0955, + 
"mean_token_accuracy": 0.6855690479278564, + "num_tokens": 310299363.0, + "step": 11993 + }, + { + "epoch": 1.317153525148254, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 3.77549147605896, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6847593188285828, + "num_tokens": 310327603.0, + "step": 11994 + }, + { + "epoch": 1.3172633428508675, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.506293773651123, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7143950462341309, + "num_tokens": 310352373.0, + "step": 11995 + }, + { + "epoch": 1.3173731605534813, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.350593328475952, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7219159603118896, + "num_tokens": 310378023.0, + "step": 11996 + }, + { + "epoch": 1.3174829782560948, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1063904762268066, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6933114528656006, + "num_tokens": 310408772.0, + "step": 11997 + }, + { + "epoch": 1.3175927959587086, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.29363751411438, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7029930949211121, + "num_tokens": 310437185.0, + "step": 11998 + }, + { + "epoch": 1.3177026136613221, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.9695357084274292, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7125452756881714, + "num_tokens": 310472318.0, + "step": 11999 + }, + { + "epoch": 1.3178124313639359, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.1895017623901367, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7271274924278259, + "num_tokens": 310501902.0, + "step": 12000 + }, + { + "epoch": 1.3179222490665494, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5830161571502686, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 
0.7436307668685913, + "num_tokens": 310523070.0, + "step": 12001 + }, + { + "epoch": 1.3180320667691632, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3317174911499023, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7118899822235107, + "num_tokens": 310548975.0, + "step": 12002 + }, + { + "epoch": 1.318141884471777, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.134033679962158, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7467413544654846, + "num_tokens": 310578440.0, + "step": 12003 + }, + { + "epoch": 1.3182517021743905, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.3466827869415283, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6921108365058899, + "num_tokens": 310608050.0, + "step": 12004 + }, + { + "epoch": 1.3183615198770042, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.5101823806762695, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7094142436981201, + "num_tokens": 310634990.0, + "step": 12005 + }, + { + "epoch": 1.3184713375796178, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2772634029388428, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7107165455818176, + "num_tokens": 310663671.0, + "step": 12006 + }, + { + "epoch": 1.3185811552822315, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4928250312805176, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7333816289901733, + "num_tokens": 310686032.0, + "step": 12007 + }, + { + "epoch": 1.3186909729848453, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.28084659576416, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7058999538421631, + "num_tokens": 310715174.0, + "step": 12008 + }, + { + "epoch": 1.3188007906874588, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.4226434230804443, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6884456872940063, + 
"num_tokens": 310741269.0, + "step": 12009 + }, + { + "epoch": 1.3189106083900723, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.698428153991699, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7264312505722046, + "num_tokens": 310761641.0, + "step": 12010 + }, + { + "epoch": 1.319020426092686, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.53532338142395, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7203918695449829, + "num_tokens": 310784991.0, + "step": 12011 + }, + { + "epoch": 1.3191302437952999, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.6787712574005127, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.6889760494232178, + "num_tokens": 310807560.0, + "step": 12012 + }, + { + "epoch": 1.3192400614979134, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.278357982635498, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7145992517471313, + "num_tokens": 310833251.0, + "step": 12013 + }, + { + "epoch": 1.3193498792005272, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.170231819152832, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7068576216697693, + "num_tokens": 310865625.0, + "step": 12014 + }, + { + "epoch": 1.3194596969031407, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.547959804534912, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7116156816482544, + "num_tokens": 310889811.0, + "step": 12015 + }, + { + "epoch": 1.3195695146057544, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.826446056365967, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7137597799301147, + "num_tokens": 310908529.0, + "step": 12016 + }, + { + "epoch": 1.3196793323083682, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.167739152908325, + "learning_rate": 1e-06, + "loss": 1.1279, + "mean_token_accuracy": 0.6735774278640747, + "num_tokens": 310940143.0, + 
"step": 12017 + }, + { + "epoch": 1.3197891500109817, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.737398386001587, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7141543030738831, + "num_tokens": 310960261.0, + "step": 12018 + }, + { + "epoch": 1.3198989677135955, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.649381399154663, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7078783512115479, + "num_tokens": 310983025.0, + "step": 12019 + }, + { + "epoch": 1.320008785416209, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.3922438621520996, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.703393816947937, + "num_tokens": 311008978.0, + "step": 12020 + }, + { + "epoch": 1.3201186031188228, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.268033742904663, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7213847637176514, + "num_tokens": 311037576.0, + "step": 12021 + }, + { + "epoch": 1.3202284208214365, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.5870673656463623, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6932236552238464, + "num_tokens": 311061893.0, + "step": 12022 + }, + { + "epoch": 1.32033823852405, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.7157745361328125, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7298189401626587, + "num_tokens": 311084726.0, + "step": 12023 + }, + { + "epoch": 1.3204480562266636, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.339864492416382, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7170227766036987, + "num_tokens": 311110099.0, + "step": 12024 + }, + { + "epoch": 1.3205578739292774, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.751516342163086, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7286577224731445, + "num_tokens": 311130690.0, + "step": 12025 + }, + { + "epoch": 
1.3206676916318911, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.473571300506592, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6865694522857666, + "num_tokens": 311157366.0, + "step": 12026 + }, + { + "epoch": 1.3207775093345047, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.532230854034424, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7019211649894714, + "num_tokens": 311179686.0, + "step": 12027 + }, + { + "epoch": 1.3208873270371184, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.6201179027557373, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6964675188064575, + "num_tokens": 311203656.0, + "step": 12028 + }, + { + "epoch": 1.320997144739732, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.318228244781494, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7072365283966064, + "num_tokens": 311230369.0, + "step": 12029 + }, + { + "epoch": 1.3211069624423457, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.2023732662200928, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.730171263217926, + "num_tokens": 311260256.0, + "step": 12030 + }, + { + "epoch": 1.3212167801449595, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6145055294036865, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6938323974609375, + "num_tokens": 311282552.0, + "step": 12031 + }, + { + "epoch": 1.321326597847573, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.5048460960388184, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7345409393310547, + "num_tokens": 311304267.0, + "step": 12032 + }, + { + "epoch": 1.3214364155501868, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.347409963607788, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6935454607009888, + "num_tokens": 311330001.0, + "step": 12033 + }, + { + "epoch": 1.3215462332528003, + "ewc_loss": 
1.800060272216797e-05, + "grad_norm": 2.536370277404785, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7122557759284973, + "num_tokens": 311355827.0, + "step": 12034 + }, + { + "epoch": 1.321656050955414, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.5290262699127197, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7153880596160889, + "num_tokens": 311379536.0, + "step": 12035 + }, + { + "epoch": 1.3217658686580278, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3893680572509766, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7179355621337891, + "num_tokens": 311404203.0, + "step": 12036 + }, + { + "epoch": 1.3218756863606413, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.405374526977539, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7206125259399414, + "num_tokens": 311429686.0, + "step": 12037 + }, + { + "epoch": 1.3219855040632549, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.2059097290039062, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7045162320137024, + "num_tokens": 311459298.0, + "step": 12038 + }, + { + "epoch": 1.3220953217658686, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.667553424835205, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7357257008552551, + "num_tokens": 311479998.0, + "step": 12039 + }, + { + "epoch": 1.3222051394684824, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.437279462814331, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6934236288070679, + "num_tokens": 311508414.0, + "step": 12040 + }, + { + "epoch": 1.322314957171096, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.4137349128723145, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7237758636474609, + "num_tokens": 311531813.0, + "step": 12041 + }, + { + "epoch": 1.3224247748737097, + "ewc_loss": 1.7881393432617188e-05, + 
"grad_norm": 2.0958356857299805, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7132927775382996, + "num_tokens": 311565499.0, + "step": 12042 + }, + { + "epoch": 1.3225345925763232, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.7443575859069824, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7115544080734253, + "num_tokens": 311585634.0, + "step": 12043 + }, + { + "epoch": 1.322644410278937, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.2897379398345947, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7112315893173218, + "num_tokens": 311612145.0, + "step": 12044 + }, + { + "epoch": 1.3227542279815507, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.3009912967681885, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6921615600585938, + "num_tokens": 311637952.0, + "step": 12045 + }, + { + "epoch": 1.3228640456841643, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.1295688152313232, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7229698896408081, + "num_tokens": 311667932.0, + "step": 12046 + }, + { + "epoch": 1.322973863386778, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.441301107406616, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7210842370986938, + "num_tokens": 311691471.0, + "step": 12047 + }, + { + "epoch": 1.3230836810893916, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.704333543777466, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7183883190155029, + "num_tokens": 311710910.0, + "step": 12048 + }, + { + "epoch": 1.3231934987920053, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 2.524878978729248, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7306900024414062, + "num_tokens": 311733474.0, + "step": 12049 + }, + { + "epoch": 1.3233033164946189, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 
2.745467185974121, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7091102600097656, + "num_tokens": 311754050.0, + "step": 12050 + }, + { + "epoch": 1.3234131341972326, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4237236976623535, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6960700750350952, + "num_tokens": 311782576.0, + "step": 12051 + }, + { + "epoch": 1.3235229518998461, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.322006940841675, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7231560349464417, + "num_tokens": 311810613.0, + "step": 12052 + }, + { + "epoch": 1.32363276960246, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.1435303688049316, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.6848788857460022, + "num_tokens": 311845004.0, + "step": 12053 + }, + { + "epoch": 1.3237425873050737, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.7640464305877686, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7343901991844177, + "num_tokens": 311865498.0, + "step": 12054 + }, + { + "epoch": 1.3238524050076872, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.1646616458892822, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7020348906517029, + "num_tokens": 311895993.0, + "step": 12055 + }, + { + "epoch": 1.323962222710301, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.376520872116089, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7130544781684875, + "num_tokens": 311922674.0, + "step": 12056 + }, + { + "epoch": 1.3240720404129145, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6609792709350586, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7505491971969604, + "num_tokens": 311944228.0, + "step": 12057 + }, + { + "epoch": 1.3241818581155282, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4107096195220947, + 
"learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7064436674118042, + "num_tokens": 311967283.0, + "step": 12058 + }, + { + "epoch": 1.324291675818142, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.8230576515197754, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7173984050750732, + "num_tokens": 311986322.0, + "step": 12059 + }, + { + "epoch": 1.3244014935207555, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4996306896209717, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7093480825424194, + "num_tokens": 312011426.0, + "step": 12060 + }, + { + "epoch": 1.324511311223369, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.215256929397583, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6873812079429626, + "num_tokens": 312043022.0, + "step": 12061 + }, + { + "epoch": 1.3246211289259828, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.361609935760498, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6862556338310242, + "num_tokens": 312073029.0, + "step": 12062 + }, + { + "epoch": 1.3247309466285966, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.408676862716675, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7216179370880127, + "num_tokens": 312098938.0, + "step": 12063 + }, + { + "epoch": 1.3248407643312101, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.543445587158203, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7087866067886353, + "num_tokens": 312122336.0, + "step": 12064 + }, + { + "epoch": 1.3249505820338239, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.514934539794922, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.700822114944458, + "num_tokens": 312145917.0, + "step": 12065 + }, + { + "epoch": 1.3250603997364374, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6654915809631348, + "learning_rate": 1e-06, + "loss": 
1.0177, + "mean_token_accuracy": 0.6984670758247375, + "num_tokens": 312167339.0, + "step": 12066 + }, + { + "epoch": 1.3251702174390512, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4697484970092773, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.719785213470459, + "num_tokens": 312189994.0, + "step": 12067 + }, + { + "epoch": 1.325280035141665, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.358428955078125, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7079479098320007, + "num_tokens": 312220602.0, + "step": 12068 + }, + { + "epoch": 1.3253898528442785, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.399479627609253, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7053365707397461, + "num_tokens": 312244048.0, + "step": 12069 + }, + { + "epoch": 1.3254996705468922, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.25911021232605, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7043153047561646, + "num_tokens": 312272462.0, + "step": 12070 + }, + { + "epoch": 1.3256094882495058, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.1646840572357178, + "learning_rate": 1e-06, + "loss": 1.099, + "mean_token_accuracy": 0.6868093013763428, + "num_tokens": 312304011.0, + "step": 12071 + }, + { + "epoch": 1.3257193059521195, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.052088737487793, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6994951963424683, + "num_tokens": 312339018.0, + "step": 12072 + }, + { + "epoch": 1.3258291236547333, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.330024480819702, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7021875977516174, + "num_tokens": 312365181.0, + "step": 12073 + }, + { + "epoch": 1.3259389413573468, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.968843936920166, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 
0.7339621782302856, + "num_tokens": 312380926.0, + "step": 12074 + }, + { + "epoch": 1.3260487590599603, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3122968673706055, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7171757221221924, + "num_tokens": 312408282.0, + "step": 12075 + }, + { + "epoch": 1.326158576762574, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4323394298553467, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7216417789459229, + "num_tokens": 312432842.0, + "step": 12076 + }, + { + "epoch": 1.3262683944651878, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3317484855651855, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7254122495651245, + "num_tokens": 312457932.0, + "step": 12077 + }, + { + "epoch": 1.3263782121678014, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3206708431243896, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7408074140548706, + "num_tokens": 312482510.0, + "step": 12078 + }, + { + "epoch": 1.3264880298704151, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.9661575555801392, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6881409883499146, + "num_tokens": 312522526.0, + "step": 12079 + }, + { + "epoch": 1.3265978475730287, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3866806030273438, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7043312788009644, + "num_tokens": 312547872.0, + "step": 12080 + }, + { + "epoch": 1.3267076652756424, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.607773542404175, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7390152812004089, + "num_tokens": 312567048.0, + "step": 12081 + }, + { + "epoch": 1.3268174829782562, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.434610366821289, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7113555073738098, + "num_tokens": 
312590450.0, + "step": 12082 + }, + { + "epoch": 1.3269273006808697, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.240107297897339, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6950473785400391, + "num_tokens": 312621839.0, + "step": 12083 + }, + { + "epoch": 1.3270371183834835, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3449268341064453, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6926208138465881, + "num_tokens": 312649493.0, + "step": 12084 + }, + { + "epoch": 1.327146936086097, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3618130683898926, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.701050341129303, + "num_tokens": 312678801.0, + "step": 12085 + }, + { + "epoch": 1.3272567537887108, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.431751251220703, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7232224941253662, + "num_tokens": 312702894.0, + "step": 12086 + }, + { + "epoch": 1.3273665714913245, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.42596173286438, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7098547220230103, + "num_tokens": 312726396.0, + "step": 12087 + }, + { + "epoch": 1.327476389193938, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.169342517852783, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.697216272354126, + "num_tokens": 312755864.0, + "step": 12088 + }, + { + "epoch": 1.3275862068965516, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6090450286865234, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6888334155082703, + "num_tokens": 312780319.0, + "step": 12089 + }, + { + "epoch": 1.3276960245991654, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.261965274810791, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7073456645011902, + "num_tokens": 312808674.0, + "step": 12090 + }, + { + 
"epoch": 1.3278058423017791, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.350233554840088, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6955357789993286, + "num_tokens": 312834242.0, + "step": 12091 + }, + { + "epoch": 1.3279156600043927, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.574023485183716, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7236431837081909, + "num_tokens": 312856847.0, + "step": 12092 + }, + { + "epoch": 1.3280254777070064, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2941272258758545, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6955251693725586, + "num_tokens": 312885152.0, + "step": 12093 + }, + { + "epoch": 1.32813529540962, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.582735061645508, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.704522967338562, + "num_tokens": 312907134.0, + "step": 12094 + }, + { + "epoch": 1.3282451131122337, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6837432384490967, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.731278121471405, + "num_tokens": 312929375.0, + "step": 12095 + }, + { + "epoch": 1.3283549308148475, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.294337272644043, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6812421679496765, + "num_tokens": 312959592.0, + "step": 12096 + }, + { + "epoch": 1.328464748517461, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.327040433883667, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7220636606216431, + "num_tokens": 312984791.0, + "step": 12097 + }, + { + "epoch": 1.3285745662200747, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4536871910095215, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7269068360328674, + "num_tokens": 313008554.0, + "step": 12098 + }, + { + "epoch": 1.3286843839226883, + "ewc_loss": 
1.811981201171875e-05, + "grad_norm": 2.3353891372680664, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7037914991378784, + "num_tokens": 313036100.0, + "step": 12099 + }, + { + "epoch": 1.328794201625302, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4504294395446777, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7135206460952759, + "num_tokens": 313060742.0, + "step": 12100 + }, + { + "epoch": 1.3289040193279158, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3928489685058594, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7186142206192017, + "num_tokens": 313085469.0, + "step": 12101 + }, + { + "epoch": 1.3290138370305293, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.264209270477295, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6996334195137024, + "num_tokens": 313112793.0, + "step": 12102 + }, + { + "epoch": 1.3291236547331429, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.705418109893799, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7272005081176758, + "num_tokens": 313134516.0, + "step": 12103 + }, + { + "epoch": 1.3292334724357566, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.265113353729248, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.706686794757843, + "num_tokens": 313163023.0, + "step": 12104 + }, + { + "epoch": 1.3293432901383704, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.1231439113616943, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7152193784713745, + "num_tokens": 313196724.0, + "step": 12105 + }, + { + "epoch": 1.329453107840984, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2013027667999268, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.708045482635498, + "num_tokens": 313227346.0, + "step": 12106 + }, + { + "epoch": 1.3295629255435977, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 
2.4184107780456543, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.700446605682373, + "num_tokens": 313255908.0, + "step": 12107 + }, + { + "epoch": 1.3296727432462112, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3909084796905518, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7076314687728882, + "num_tokens": 313282227.0, + "step": 12108 + }, + { + "epoch": 1.329782560948825, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.471355676651001, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7223735451698303, + "num_tokens": 313303207.0, + "step": 12109 + }, + { + "epoch": 1.3298923786514387, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.61055850982666, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6975080370903015, + "num_tokens": 313325047.0, + "step": 12110 + }, + { + "epoch": 1.3300021963540523, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.310417652130127, + "learning_rate": 1e-06, + "loss": 1.0971, + "mean_token_accuracy": 0.6818653345108032, + "num_tokens": 313356491.0, + "step": 12111 + }, + { + "epoch": 1.330112014056666, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.139983892440796, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6950942277908325, + "num_tokens": 313389447.0, + "step": 12112 + }, + { + "epoch": 1.3302218317592795, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.5166046619415283, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7059744596481323, + "num_tokens": 313415681.0, + "step": 12113 + }, + { + "epoch": 1.3303316494618933, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.5241811275482178, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7158024907112122, + "num_tokens": 313440534.0, + "step": 12114 + }, + { + "epoch": 1.3304414671645068, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.1612331867218018, + "learning_rate": 
1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7086026072502136, + "num_tokens": 313472238.0, + "step": 12115 + }, + { + "epoch": 1.3305512848671206, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4731078147888184, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7020215392112732, + "num_tokens": 313496901.0, + "step": 12116 + }, + { + "epoch": 1.3306611025697341, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.454371213912964, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.690832793712616, + "num_tokens": 313521690.0, + "step": 12117 + }, + { + "epoch": 1.330770920272348, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4157214164733887, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7132308483123779, + "num_tokens": 313546813.0, + "step": 12118 + }, + { + "epoch": 1.3308807379749616, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2529711723327637, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7150365114212036, + "num_tokens": 313574997.0, + "step": 12119 + }, + { + "epoch": 1.3309905556775752, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.1245510578155518, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.699296236038208, + "num_tokens": 313607098.0, + "step": 12120 + }, + { + "epoch": 1.331100373380189, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.445354461669922, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.738998293876648, + "num_tokens": 313630093.0, + "step": 12121 + }, + { + "epoch": 1.3312101910828025, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.267001152038574, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7097780704498291, + "num_tokens": 313657207.0, + "step": 12122 + }, + { + "epoch": 1.3313200087854162, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.242151975631714, + "learning_rate": 1e-06, + "loss": 1.0104, + 
"mean_token_accuracy": 0.703345775604248, + "num_tokens": 313686686.0, + "step": 12123 + }, + { + "epoch": 1.33142982648803, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.0987393856048584, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.7030898332595825, + "num_tokens": 313718020.0, + "step": 12124 + }, + { + "epoch": 1.3315396441906435, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3534512519836426, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7192625999450684, + "num_tokens": 313744954.0, + "step": 12125 + }, + { + "epoch": 1.331649461893257, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.455994129180908, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6955668926239014, + "num_tokens": 313767651.0, + "step": 12126 + }, + { + "epoch": 1.3317592795958708, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3894147872924805, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.713135302066803, + "num_tokens": 313793030.0, + "step": 12127 + }, + { + "epoch": 1.3318690972984846, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.300718307495117, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7072130441665649, + "num_tokens": 313819612.0, + "step": 12128 + }, + { + "epoch": 1.331978915001098, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2271862030029297, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7198410630226135, + "num_tokens": 313847069.0, + "step": 12129 + }, + { + "epoch": 1.3320887327037119, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.233123540878296, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7240618467330933, + "num_tokens": 313875458.0, + "step": 12130 + }, + { + "epoch": 1.3321985504063254, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6500744819641113, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 
0.7249313592910767, + "num_tokens": 313896527.0, + "step": 12131 + }, + { + "epoch": 1.3323083681089392, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6396772861480713, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7072569131851196, + "num_tokens": 313918697.0, + "step": 12132 + }, + { + "epoch": 1.332418185811553, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.285529375076294, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7100604176521301, + "num_tokens": 313946077.0, + "step": 12133 + }, + { + "epoch": 1.3325280035141664, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.331831693649292, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7020770907402039, + "num_tokens": 313972542.0, + "step": 12134 + }, + { + "epoch": 1.3326378212167802, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.093897581100464, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7173594832420349, + "num_tokens": 314003809.0, + "step": 12135 + }, + { + "epoch": 1.3327476389193937, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.465935707092285, + "learning_rate": 1e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6857812404632568, + "num_tokens": 314030491.0, + "step": 12136 + }, + { + "epoch": 1.3328574566220075, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.177326202392578, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7163897752761841, + "num_tokens": 314062339.0, + "step": 12137 + }, + { + "epoch": 1.3329672743246213, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2951385974884033, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7244035601615906, + "num_tokens": 314089700.0, + "step": 12138 + }, + { + "epoch": 1.3330770920272348, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.292523145675659, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6977971792221069, + "num_tokens": 
314120020.0, + "step": 12139 + }, + { + "epoch": 1.3331869097298483, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4788925647735596, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7273839116096497, + "num_tokens": 314143549.0, + "step": 12140 + }, + { + "epoch": 1.333296727432462, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.175640106201172, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7120811939239502, + "num_tokens": 314173193.0, + "step": 12141 + }, + { + "epoch": 1.3334065451350758, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2664918899536133, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7060390710830688, + "num_tokens": 314200701.0, + "step": 12142 + }, + { + "epoch": 1.3335163628376894, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3466293811798096, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7232816219329834, + "num_tokens": 314224618.0, + "step": 12143 + }, + { + "epoch": 1.3336261805403031, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2201552391052246, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6945598721504211, + "num_tokens": 314254988.0, + "step": 12144 + }, + { + "epoch": 1.3337359982429167, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2593414783477783, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6996520161628723, + "num_tokens": 314282456.0, + "step": 12145 + }, + { + "epoch": 1.3338458159455304, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.548753499984741, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6914998292922974, + "num_tokens": 314305152.0, + "step": 12146 + }, + { + "epoch": 1.3339556336481442, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3581812381744385, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7047212719917297, + "num_tokens": 314333641.0, + "step": 12147 + }, 
+ { + "epoch": 1.3340654513507577, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.0521726608276367, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6929402947425842, + "num_tokens": 314368978.0, + "step": 12148 + }, + { + "epoch": 1.3341752690533715, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2431814670562744, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7225286960601807, + "num_tokens": 314394530.0, + "step": 12149 + }, + { + "epoch": 1.334285086755985, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.497069835662842, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7080127000808716, + "num_tokens": 314417140.0, + "step": 12150 + }, + { + "epoch": 1.3343949044585988, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.435880422592163, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7046726942062378, + "num_tokens": 314440596.0, + "step": 12151 + }, + { + "epoch": 1.3345047221612125, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4196791648864746, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7221658825874329, + "num_tokens": 314462690.0, + "step": 12152 + }, + { + "epoch": 1.334614539863826, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6157033443450928, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6994826793670654, + "num_tokens": 314485297.0, + "step": 12153 + }, + { + "epoch": 1.3347243575664396, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.5309689044952393, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7085518836975098, + "num_tokens": 314511086.0, + "step": 12154 + }, + { + "epoch": 1.3348341752690533, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4608285427093506, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7343635559082031, + "num_tokens": 314534220.0, + "step": 12155 + }, + { + "epoch": 1.334943992971667, + 
"ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3981082439422607, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6960762739181519, + "num_tokens": 314559951.0, + "step": 12156 + }, + { + "epoch": 1.3350538106742806, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.363211154937744, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7136750221252441, + "num_tokens": 314584385.0, + "step": 12157 + }, + { + "epoch": 1.3351636283768944, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4715778827667236, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7225453853607178, + "num_tokens": 314608107.0, + "step": 12158 + }, + { + "epoch": 1.335273446079508, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3382468223571777, + "learning_rate": 1e-06, + "loss": 1.0926, + "mean_token_accuracy": 0.6796435713768005, + "num_tokens": 314636606.0, + "step": 12159 + }, + { + "epoch": 1.3353832637821217, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.434744119644165, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7202020883560181, + "num_tokens": 314660635.0, + "step": 12160 + }, + { + "epoch": 1.3354930814847354, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3346340656280518, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7175765037536621, + "num_tokens": 314687780.0, + "step": 12161 + }, + { + "epoch": 1.335602899187349, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.386068820953369, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7258478999137878, + "num_tokens": 314713069.0, + "step": 12162 + }, + { + "epoch": 1.3357127168899627, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.8035430908203125, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7065272331237793, + "num_tokens": 314737852.0, + "step": 12163 + }, + { + "epoch": 1.3358225345925763, + "ewc_loss": 1.800060272216797e-05, + 
"grad_norm": 2.3262925148010254, + "learning_rate": 1e-06, + "loss": 1.0954, + "mean_token_accuracy": 0.6816897392272949, + "num_tokens": 314764199.0, + "step": 12164 + }, + { + "epoch": 1.33593235229519, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.458331346511841, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7187244296073914, + "num_tokens": 314788466.0, + "step": 12165 + }, + { + "epoch": 1.3360421699978036, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4181020259857178, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6990631818771362, + "num_tokens": 314814228.0, + "step": 12166 + }, + { + "epoch": 1.3361519877004173, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.0562515258789062, + "learning_rate": 1e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6802211999893188, + "num_tokens": 314847990.0, + "step": 12167 + }, + { + "epoch": 1.3362618054030309, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.612837314605713, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7296961545944214, + "num_tokens": 314871778.0, + "step": 12168 + }, + { + "epoch": 1.3363716231056446, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.6583685874938965, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7080591917037964, + "num_tokens": 314894790.0, + "step": 12169 + }, + { + "epoch": 1.3364814408082584, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.5128841400146484, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7082082033157349, + "num_tokens": 314918890.0, + "step": 12170 + }, + { + "epoch": 1.336591258510872, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4155688285827637, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7207161784172058, + "num_tokens": 314944008.0, + "step": 12171 + }, + { + "epoch": 1.3367010762134857, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.102053165435791, + 
"learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6951777935028076, + "num_tokens": 314979013.0, + "step": 12172 + }, + { + "epoch": 1.3368108939160992, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3460984230041504, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7355998754501343, + "num_tokens": 315003209.0, + "step": 12173 + }, + { + "epoch": 1.336920711618713, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.297557830810547, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.728836178779602, + "num_tokens": 315029803.0, + "step": 12174 + }, + { + "epoch": 1.3370305293213267, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.4197545051574707, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.7037851214408875, + "num_tokens": 315058104.0, + "step": 12175 + }, + { + "epoch": 1.3371403470239402, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 7.078162670135498, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7249965071678162, + "num_tokens": 315080546.0, + "step": 12176 + }, + { + "epoch": 1.337250164726554, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.460407257080078, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7181174755096436, + "num_tokens": 315105139.0, + "step": 12177 + }, + { + "epoch": 1.3373599824291675, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.515024423599243, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.702164888381958, + "num_tokens": 315128254.0, + "step": 12178 + }, + { + "epoch": 1.3374698001317813, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.331451892852783, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7061474323272705, + "num_tokens": 315151826.0, + "step": 12179 + }, + { + "epoch": 1.3375796178343948, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.250326156616211, + "learning_rate": 1e-06, + "loss": 1.0174, 
+ "mean_token_accuracy": 0.7035887241363525, + "num_tokens": 315180659.0, + "step": 12180 + }, + { + "epoch": 1.3376894355370086, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.303447961807251, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7098150849342346, + "num_tokens": 315208419.0, + "step": 12181 + }, + { + "epoch": 1.3377992532396221, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.2374536991119385, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7168222069740295, + "num_tokens": 315233736.0, + "step": 12182 + }, + { + "epoch": 1.3379090709422359, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3416504859924316, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7395399212837219, + "num_tokens": 315258558.0, + "step": 12183 + }, + { + "epoch": 1.3380188886448496, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3654143810272217, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7065041661262512, + "num_tokens": 315284503.0, + "step": 12184 + }, + { + "epoch": 1.3381287063474632, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.3360695838928223, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7043874263763428, + "num_tokens": 315314227.0, + "step": 12185 + }, + { + "epoch": 1.338238524050077, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2028651237487793, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7367031574249268, + "num_tokens": 315341861.0, + "step": 12186 + }, + { + "epoch": 1.3383483417526905, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.7288341522216797, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.727722704410553, + "num_tokens": 315362198.0, + "step": 12187 + }, + { + "epoch": 1.3384581594553042, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.671037435531616, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 
0.704399585723877, + "num_tokens": 315384800.0, + "step": 12188 + }, + { + "epoch": 1.338567977157918, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.404231071472168, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7142362594604492, + "num_tokens": 315412593.0, + "step": 12189 + }, + { + "epoch": 1.3386777948605315, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.50667405128479, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7118449211120605, + "num_tokens": 315437974.0, + "step": 12190 + }, + { + "epoch": 1.338787612563145, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2523677349090576, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6836181282997131, + "num_tokens": 315466659.0, + "step": 12191 + }, + { + "epoch": 1.3388974302657588, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.091503858566284, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.721255898475647, + "num_tokens": 315498402.0, + "step": 12192 + }, + { + "epoch": 1.3390072479683726, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.525543451309204, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7191526889801025, + "num_tokens": 315521133.0, + "step": 12193 + }, + { + "epoch": 1.339117065670986, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4558732509613037, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7319939136505127, + "num_tokens": 315545156.0, + "step": 12194 + }, + { + "epoch": 1.3392268833735999, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.274327039718628, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7086686491966248, + "num_tokens": 315571491.0, + "step": 12195 + }, + { + "epoch": 1.3393367010762134, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.395960807800293, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7184658050537109, + "num_tokens": 
315597388.0, + "step": 12196 + }, + { + "epoch": 1.3394465187788271, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.7539916038513184, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7176588177680969, + "num_tokens": 315618018.0, + "step": 12197 + }, + { + "epoch": 1.339556336481441, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3254432678222656, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7206783294677734, + "num_tokens": 315644214.0, + "step": 12198 + }, + { + "epoch": 1.3396661541840544, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3814914226531982, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7188266515731812, + "num_tokens": 315670775.0, + "step": 12199 + }, + { + "epoch": 1.3397759718866682, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6560521125793457, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7144442796707153, + "num_tokens": 315692463.0, + "step": 12200 + }, + { + "epoch": 1.3398857895892817, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.485973358154297, + "learning_rate": 1e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.6829327940940857, + "num_tokens": 315719338.0, + "step": 12201 + }, + { + "epoch": 1.3399956072918955, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2895874977111816, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6852701902389526, + "num_tokens": 315748717.0, + "step": 12202 + }, + { + "epoch": 1.3401054249945092, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.440786838531494, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7421097159385681, + "num_tokens": 315771304.0, + "step": 12203 + }, + { + "epoch": 1.3402152426971228, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3183517456054688, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7028151154518127, + "num_tokens": 315801733.0, + "step": 12204 + }, 
+ { + "epoch": 1.3403250603997363, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4835667610168457, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7049344778060913, + "num_tokens": 315825516.0, + "step": 12205 + }, + { + "epoch": 1.34043487810235, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2972497940063477, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6988828778266907, + "num_tokens": 315854523.0, + "step": 12206 + }, + { + "epoch": 1.3405446958049638, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.335496664047241, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7141719460487366, + "num_tokens": 315881088.0, + "step": 12207 + }, + { + "epoch": 1.3406545135075774, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.1546313762664795, + "learning_rate": 1e-06, + "loss": 1.1257, + "mean_token_accuracy": 0.6751189827919006, + "num_tokens": 315913033.0, + "step": 12208 + }, + { + "epoch": 1.3407643312101911, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.294772148132324, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.7013571858406067, + "num_tokens": 315939853.0, + "step": 12209 + }, + { + "epoch": 1.3408741489128047, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3054416179656982, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.7027057409286499, + "num_tokens": 315964538.0, + "step": 12210 + }, + { + "epoch": 1.3409839666154184, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.0194952487945557, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6999081373214722, + "num_tokens": 316001696.0, + "step": 12211 + }, + { + "epoch": 1.3410937843180322, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3019251823425293, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7075470685958862, + "num_tokens": 316027487.0, + "step": 12212 + }, + { + "epoch": 1.3412036020206457, 
+ "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3298678398132324, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7187896966934204, + "num_tokens": 316052502.0, + "step": 12213 + }, + { + "epoch": 1.3413134197232595, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.5339245796203613, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7322301268577576, + "num_tokens": 316074290.0, + "step": 12214 + }, + { + "epoch": 1.341423237425873, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.772779703140259, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7081665992736816, + "num_tokens": 316095713.0, + "step": 12215 + }, + { + "epoch": 1.3415330551284868, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.67569637298584, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.7033931612968445, + "num_tokens": 316119090.0, + "step": 12216 + }, + { + "epoch": 1.3416428728311005, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.207979202270508, + "learning_rate": 1e-06, + "loss": 1.1343, + "mean_token_accuracy": 0.6690666079521179, + "num_tokens": 316151908.0, + "step": 12217 + }, + { + "epoch": 1.341752690533714, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6902527809143066, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7302675247192383, + "num_tokens": 316173112.0, + "step": 12218 + }, + { + "epoch": 1.3418625082363276, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.444934844970703, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7279512286186218, + "num_tokens": 316195721.0, + "step": 12219 + }, + { + "epoch": 1.3419723259389413, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2087771892547607, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7031219005584717, + "num_tokens": 316224989.0, + "step": 12220 + }, + { + "epoch": 1.342082143641555, + "ewc_loss": 1.811981201171875e-05, + 
"grad_norm": 2.0592148303985596, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7005928158760071, + "num_tokens": 316259046.0, + "step": 12221 + }, + { + "epoch": 1.3421919613441686, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.557892084121704, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7258657813072205, + "num_tokens": 316281973.0, + "step": 12222 + }, + { + "epoch": 1.3423017790467824, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.448498010635376, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7040368318557739, + "num_tokens": 316307353.0, + "step": 12223 + }, + { + "epoch": 1.342411596749396, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.51313853263855, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7070510387420654, + "num_tokens": 316332042.0, + "step": 12224 + }, + { + "epoch": 1.3425214144520097, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.155588150024414, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7188233733177185, + "num_tokens": 316361949.0, + "step": 12225 + }, + { + "epoch": 1.3426312321546234, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2338552474975586, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7339427471160889, + "num_tokens": 316389388.0, + "step": 12226 + }, + { + "epoch": 1.342741049857237, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.620868682861328, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7185664176940918, + "num_tokens": 316411616.0, + "step": 12227 + }, + { + "epoch": 1.3428508675598507, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.523491621017456, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7168418169021606, + "num_tokens": 316436210.0, + "step": 12228 + }, + { + "epoch": 1.3429606852624643, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4762532711029053, + 
"learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.6938402056694031, + "num_tokens": 316460338.0, + "step": 12229 + }, + { + "epoch": 1.343070502965078, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.072307825088501, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7151684761047363, + "num_tokens": 316492222.0, + "step": 12230 + }, + { + "epoch": 1.3431803206676916, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.259901523590088, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7214856147766113, + "num_tokens": 316520060.0, + "step": 12231 + }, + { + "epoch": 1.3432901383703053, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.0416438579559326, + "learning_rate": 1e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.6854659914970398, + "num_tokens": 316557495.0, + "step": 12232 + }, + { + "epoch": 1.3433999560729188, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.505164623260498, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7028298377990723, + "num_tokens": 316580479.0, + "step": 12233 + }, + { + "epoch": 1.3435097737755326, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4458842277526855, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7159309983253479, + "num_tokens": 316604763.0, + "step": 12234 + }, + { + "epoch": 1.3436195914781464, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4097251892089844, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7334878444671631, + "num_tokens": 316628819.0, + "step": 12235 + }, + { + "epoch": 1.34372940918076, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.582212209701538, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7169220447540283, + "num_tokens": 316649717.0, + "step": 12236 + }, + { + "epoch": 1.3438392268833736, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.430814504623413, + "learning_rate": 1e-06, + "loss": 1.01, 
+ "mean_token_accuracy": 0.709393322467804, + "num_tokens": 316673641.0, + "step": 12237 + }, + { + "epoch": 1.3439490445859872, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2505416870117188, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7040622234344482, + "num_tokens": 316703302.0, + "step": 12238 + }, + { + "epoch": 1.344058862288601, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.273214101791382, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7119066715240479, + "num_tokens": 316731974.0, + "step": 12239 + }, + { + "epoch": 1.3441686799912147, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.5539112091064453, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7198286056518555, + "num_tokens": 316753735.0, + "step": 12240 + }, + { + "epoch": 1.3442784976938282, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4684035778045654, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7259619235992432, + "num_tokens": 316777376.0, + "step": 12241 + }, + { + "epoch": 1.3443883153964418, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3952834606170654, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7296233177185059, + "num_tokens": 316803200.0, + "step": 12242 + }, + { + "epoch": 1.3444981330990555, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.5035088062286377, + "learning_rate": 1e-06, + "loss": 1.1282, + "mean_token_accuracy": 0.6652299165725708, + "num_tokens": 316830502.0, + "step": 12243 + }, + { + "epoch": 1.3446079508016693, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3165078163146973, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.719131350517273, + "num_tokens": 316858731.0, + "step": 12244 + }, + { + "epoch": 1.3447177685042828, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.1874983310699463, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 
0.6939740180969238, + "num_tokens": 316890304.0, + "step": 12245 + }, + { + "epoch": 1.3448275862068966, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.359649658203125, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7156398892402649, + "num_tokens": 316914171.0, + "step": 12246 + }, + { + "epoch": 1.34493740390951, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2038917541503906, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.718148946762085, + "num_tokens": 316945920.0, + "step": 12247 + }, + { + "epoch": 1.3450472216121239, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.7303855419158936, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.723223865032196, + "num_tokens": 316967233.0, + "step": 12248 + }, + { + "epoch": 1.3451570393147376, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.1895439624786377, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7011481523513794, + "num_tokens": 316996454.0, + "step": 12249 + }, + { + "epoch": 1.3452668570173512, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4674606323242188, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7060437202453613, + "num_tokens": 317021126.0, + "step": 12250 + }, + { + "epoch": 1.345376674719965, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3103830814361572, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6950553059577942, + "num_tokens": 317049288.0, + "step": 12251 + }, + { + "epoch": 1.3454864924225785, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.406709909439087, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6981289982795715, + "num_tokens": 317075763.0, + "step": 12252 + }, + { + "epoch": 1.3455963101251922, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.171921491622925, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7173635959625244, + "num_tokens": 
317105490.0, + "step": 12253 + }, + { + "epoch": 1.345706127827806, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.5741517543792725, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7158893346786499, + "num_tokens": 317128532.0, + "step": 12254 + }, + { + "epoch": 1.3458159455304195, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6164581775665283, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7170606255531311, + "num_tokens": 317149149.0, + "step": 12255 + }, + { + "epoch": 1.345925763233033, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.1749448776245117, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7204016447067261, + "num_tokens": 317179995.0, + "step": 12256 + }, + { + "epoch": 1.3460355809356468, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.288167953491211, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7127327919006348, + "num_tokens": 317208166.0, + "step": 12257 + }, + { + "epoch": 1.3461453986382605, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4191553592681885, + "learning_rate": 1e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.6824522018432617, + "num_tokens": 317235731.0, + "step": 12258 + }, + { + "epoch": 1.346255216340874, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6354281902313232, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7255272269248962, + "num_tokens": 317259161.0, + "step": 12259 + }, + { + "epoch": 1.3463650340434878, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.7208404541015625, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7062487602233887, + "num_tokens": 317280911.0, + "step": 12260 + }, + { + "epoch": 1.3464748517461014, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4445981979370117, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7104458212852478, + "num_tokens": 317307150.0, + "step": 12261 + }, + 
{ + "epoch": 1.3465846694487151, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2435827255249023, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7105333209037781, + "num_tokens": 317336466.0, + "step": 12262 + }, + { + "epoch": 1.346694487151329, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6433560848236084, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7360459566116333, + "num_tokens": 317358547.0, + "step": 12263 + }, + { + "epoch": 1.3468043048539424, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.961497187614441, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.719079852104187, + "num_tokens": 317395401.0, + "step": 12264 + }, + { + "epoch": 1.3469141225565562, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3397531509399414, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.709972083568573, + "num_tokens": 317423197.0, + "step": 12265 + }, + { + "epoch": 1.3470239402591697, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4648702144622803, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.700214147567749, + "num_tokens": 317446824.0, + "step": 12266 + }, + { + "epoch": 1.3471337579617835, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.352091073989868, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7132611870765686, + "num_tokens": 317471013.0, + "step": 12267 + }, + { + "epoch": 1.3472435756643972, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 2.495124340057373, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7271027565002441, + "num_tokens": 317494213.0, + "step": 12268 + }, + { + "epoch": 1.3473533933670108, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2669730186462402, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7154252529144287, + "num_tokens": 317521586.0, + "step": 12269 + }, + { + "epoch": 1.3474632110696243, + 
"ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.335747241973877, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7160106897354126, + "num_tokens": 317550203.0, + "step": 12270 + }, + { + "epoch": 1.347573028772238, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2495503425598145, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7174484133720398, + "num_tokens": 317577786.0, + "step": 12271 + }, + { + "epoch": 1.3476828464748518, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.309410572052002, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6955080032348633, + "num_tokens": 317605820.0, + "step": 12272 + }, + { + "epoch": 1.3477926641774653, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.312772035598755, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.6856541633605957, + "num_tokens": 317632019.0, + "step": 12273 + }, + { + "epoch": 1.347902481880079, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2601070404052734, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7065940499305725, + "num_tokens": 317660099.0, + "step": 12274 + }, + { + "epoch": 1.3480122995826926, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3699347972869873, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7016015648841858, + "num_tokens": 317686365.0, + "step": 12275 + }, + { + "epoch": 1.3481221172853064, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6339080333709717, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7130469083786011, + "num_tokens": 317707757.0, + "step": 12276 + }, + { + "epoch": 1.3482319349879202, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.459273338317871, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7340067625045776, + "num_tokens": 317731141.0, + "step": 12277 + }, + { + "epoch": 1.3483417526905337, + "ewc_loss": 1.811981201171875e-05, + 
"grad_norm": 2.3391737937927246, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7271029949188232, + "num_tokens": 317758618.0, + "step": 12278 + }, + { + "epoch": 1.3484515703931474, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.488903284072876, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.708998441696167, + "num_tokens": 317780188.0, + "step": 12279 + }, + { + "epoch": 1.348561388095761, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.407560348510742, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7356204986572266, + "num_tokens": 317803852.0, + "step": 12280 + }, + { + "epoch": 1.3486712057983747, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.0952751636505127, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7035213112831116, + "num_tokens": 317834475.0, + "step": 12281 + }, + { + "epoch": 1.3487810235009885, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3415064811706543, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.732836902141571, + "num_tokens": 317860962.0, + "step": 12282 + }, + { + "epoch": 1.348890841203602, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.305222272872925, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7137615084648132, + "num_tokens": 317887167.0, + "step": 12283 + }, + { + "epoch": 1.3490006589062156, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.8117146492004395, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7290281057357788, + "num_tokens": 317907342.0, + "step": 12284 + }, + { + "epoch": 1.3491104766088293, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.791013717651367, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7090662717819214, + "num_tokens": 317929542.0, + "step": 12285 + }, + { + "epoch": 1.349220294311443, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6718060970306396, + 
"learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7054578065872192, + "num_tokens": 317950297.0, + "step": 12286 + }, + { + "epoch": 1.3493301120140566, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3050687313079834, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6952341794967651, + "num_tokens": 317978347.0, + "step": 12287 + }, + { + "epoch": 1.3494399297166704, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4000701904296875, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.728270411491394, + "num_tokens": 318003315.0, + "step": 12288 + }, + { + "epoch": 1.349549747419284, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.565049409866333, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7214522361755371, + "num_tokens": 318027089.0, + "step": 12289 + }, + { + "epoch": 1.3496595651218977, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6182680130004883, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7151360511779785, + "num_tokens": 318049983.0, + "step": 12290 + }, + { + "epoch": 1.3497693828245114, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.37862229347229, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.7052386999130249, + "num_tokens": 318076446.0, + "step": 12291 + }, + { + "epoch": 1.349879200527125, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6702663898468018, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7186082601547241, + "num_tokens": 318099282.0, + "step": 12292 + }, + { + "epoch": 1.3499890182297387, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.398930311203003, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7070754766464233, + "num_tokens": 318125169.0, + "step": 12293 + }, + { + "epoch": 1.3500988359323522, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.066788911819458, + "learning_rate": 1e-06, + "loss": 
0.9018, + "mean_token_accuracy": 0.733871340751648, + "num_tokens": 318157779.0, + "step": 12294 + }, + { + "epoch": 1.350208653634966, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.0954301357269287, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7381372451782227, + "num_tokens": 318187301.0, + "step": 12295 + }, + { + "epoch": 1.3503184713375795, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 32.3385009765625, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7052473425865173, + "num_tokens": 318212475.0, + "step": 12296 + }, + { + "epoch": 1.3504282890401933, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.211452007293701, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6987451910972595, + "num_tokens": 318245545.0, + "step": 12297 + }, + { + "epoch": 1.3505381067428068, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.487607002258301, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7237628102302551, + "num_tokens": 318268347.0, + "step": 12298 + }, + { + "epoch": 1.3506479244454206, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2118306159973145, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7187067270278931, + "num_tokens": 318296588.0, + "step": 12299 + }, + { + "epoch": 1.3507577421480343, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.5385220050811768, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7171313762664795, + "num_tokens": 318318387.0, + "step": 12300 + }, + { + "epoch": 1.3508675598506479, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4352498054504395, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.748787522315979, + "num_tokens": 318340938.0, + "step": 12301 + }, + { + "epoch": 1.3509773775532616, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.54069185256958, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 
0.7159793972969055, + "num_tokens": 318362842.0, + "step": 12302 + }, + { + "epoch": 1.3510871952558752, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3514907360076904, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7254162430763245, + "num_tokens": 318388176.0, + "step": 12303 + }, + { + "epoch": 1.351197012958489, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4888617992401123, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.725144624710083, + "num_tokens": 318411027.0, + "step": 12304 + }, + { + "epoch": 1.3513068306611027, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.5411899089813232, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7228337526321411, + "num_tokens": 318431454.0, + "step": 12305 + }, + { + "epoch": 1.3514166483637162, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4328722953796387, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.7009101510047913, + "num_tokens": 318456081.0, + "step": 12306 + }, + { + "epoch": 1.3515264660663298, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2671501636505127, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7079384326934814, + "num_tokens": 318484595.0, + "step": 12307 + }, + { + "epoch": 1.3516362837689435, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6751890182495117, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.698788046836853, + "num_tokens": 318507267.0, + "step": 12308 + }, + { + "epoch": 1.3517461014715573, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.581923484802246, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7279452085494995, + "num_tokens": 318530460.0, + "step": 12309 + }, + { + "epoch": 1.3518559191741708, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.354029655456543, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7049447298049927, + "num_tokens": 
318557169.0, + "step": 12310 + }, + { + "epoch": 1.3519657368767846, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2912192344665527, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6984090805053711, + "num_tokens": 318585845.0, + "step": 12311 + }, + { + "epoch": 1.352075554579398, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.6473772525787354, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6980181336402893, + "num_tokens": 318607694.0, + "step": 12312 + }, + { + "epoch": 1.3521853722820119, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4721474647521973, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7063920497894287, + "num_tokens": 318630886.0, + "step": 12313 + }, + { + "epoch": 1.3522951899846256, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.673574686050415, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7271362543106079, + "num_tokens": 318651318.0, + "step": 12314 + }, + { + "epoch": 1.3524050076872391, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.2236056327819824, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.7032871842384338, + "num_tokens": 318681369.0, + "step": 12315 + }, + { + "epoch": 1.352514825389853, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4070394039154053, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6992222666740417, + "num_tokens": 318708443.0, + "step": 12316 + }, + { + "epoch": 1.3526246430924664, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.75944185256958, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7137762904167175, + "num_tokens": 318728753.0, + "step": 12317 + }, + { + "epoch": 1.3527344607950802, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.6615896224975586, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.7033141851425171, + "num_tokens": 318752398.0, + "step": 12318 + }, + 
{ + "epoch": 1.352844278497694, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4471547603607178, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6915143728256226, + "num_tokens": 318777523.0, + "step": 12319 + }, + { + "epoch": 1.3529540962003075, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.359100580215454, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7076247930526733, + "num_tokens": 318804092.0, + "step": 12320 + }, + { + "epoch": 1.353063913902921, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4090051651000977, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7170605659484863, + "num_tokens": 318830073.0, + "step": 12321 + }, + { + "epoch": 1.3531737316055348, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.366640329360962, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7100811004638672, + "num_tokens": 318855429.0, + "step": 12322 + }, + { + "epoch": 1.3532835493081485, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.45619797706604, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7239691615104675, + "num_tokens": 318878567.0, + "step": 12323 + }, + { + "epoch": 1.353393367010762, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.304107904434204, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7307708263397217, + "num_tokens": 318905612.0, + "step": 12324 + }, + { + "epoch": 1.3535031847133758, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3651747703552246, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7073693871498108, + "num_tokens": 318932808.0, + "step": 12325 + }, + { + "epoch": 1.3536130024159894, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4233908653259277, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.7040331363677979, + "num_tokens": 318958431.0, + "step": 12326 + }, + { + "epoch": 1.3537228201186031, + 
"ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.352572441101074, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7267664670944214, + "num_tokens": 318981995.0, + "step": 12327 + }, + { + "epoch": 1.3538326378212169, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.3658320903778076, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6890365481376648, + "num_tokens": 319011387.0, + "step": 12328 + }, + { + "epoch": 1.3539424555238304, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.140604257583618, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6833000183105469, + "num_tokens": 319044796.0, + "step": 12329 + }, + { + "epoch": 1.3540522732264442, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.169062376022339, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.707679271697998, + "num_tokens": 319074717.0, + "step": 12330 + }, + { + "epoch": 1.3541620909290577, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 2.4501445293426514, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6894632577896118, + "num_tokens": 319099557.0, + "step": 12331 + }, + { + "epoch": 1.3542719086316715, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5489602088928223, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.721448540687561, + "num_tokens": 319123101.0, + "step": 12332 + }, + { + "epoch": 1.3543817263342852, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.6449131965637207, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7111820578575134, + "num_tokens": 319145466.0, + "step": 12333 + }, + { + "epoch": 1.3544915440368988, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.9091719388961792, + "learning_rate": 1e-06, + "loss": 1.1058, + "mean_token_accuracy": 0.6813324689865112, + "num_tokens": 319184665.0, + "step": 12334 + }, + { + "epoch": 1.3546013617395123, + "ewc_loss": 1.823902130126953e-05, + 
"grad_norm": 2.6224770545959473, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6947405934333801, + "num_tokens": 319205907.0, + "step": 12335 + }, + { + "epoch": 1.354711179442126, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.6700596809387207, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7153016328811646, + "num_tokens": 319227066.0, + "step": 12336 + }, + { + "epoch": 1.3548209971447398, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.505539894104004, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6953269839286804, + "num_tokens": 319252101.0, + "step": 12337 + }, + { + "epoch": 1.3549308148473533, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.5469675064086914, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7108391523361206, + "num_tokens": 319275052.0, + "step": 12338 + }, + { + "epoch": 1.355040632549967, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.527738571166992, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7405964136123657, + "num_tokens": 319295313.0, + "step": 12339 + }, + { + "epoch": 1.3551504502525806, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.380307912826538, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.7004331350326538, + "num_tokens": 319321658.0, + "step": 12340 + }, + { + "epoch": 1.3552602679551944, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.6262829303741455, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7130441069602966, + "num_tokens": 319344828.0, + "step": 12341 + }, + { + "epoch": 1.3553700856578081, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.307344913482666, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.709693431854248, + "num_tokens": 319372964.0, + "step": 12342 + }, + { + "epoch": 1.3554799033604217, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.5518417358398438, + 
"learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7142731547355652, + "num_tokens": 319397019.0, + "step": 12343 + }, + { + "epoch": 1.3555897210630354, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.431957960128784, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6874655485153198, + "num_tokens": 319423150.0, + "step": 12344 + }, + { + "epoch": 1.355699538765649, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.632932186126709, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7340791821479797, + "num_tokens": 319444028.0, + "step": 12345 + }, + { + "epoch": 1.3558093564682627, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.47068190574646, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6994638442993164, + "num_tokens": 319471726.0, + "step": 12346 + }, + { + "epoch": 1.3559191741708763, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.4483675956726074, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7022684812545776, + "num_tokens": 319496920.0, + "step": 12347 + }, + { + "epoch": 1.35602899187349, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.285902738571167, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7337287664413452, + "num_tokens": 319522394.0, + "step": 12348 + }, + { + "epoch": 1.3561388095761036, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.193310260772705, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7283987998962402, + "num_tokens": 319549366.0, + "step": 12349 + }, + { + "epoch": 1.3562486272787173, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3798017501831055, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.73163902759552, + "num_tokens": 319573807.0, + "step": 12350 + }, + { + "epoch": 1.356358444981331, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5202033519744873, + "learning_rate": 1e-06, + "loss": 
0.9697, + "mean_token_accuracy": 0.7093996405601501, + "num_tokens": 319597163.0, + "step": 12351 + }, + { + "epoch": 1.3564682626839446, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.220259666442871, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7242580652236938, + "num_tokens": 319624851.0, + "step": 12352 + }, + { + "epoch": 1.3565780803865584, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.492380380630493, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6973305940628052, + "num_tokens": 319648659.0, + "step": 12353 + }, + { + "epoch": 1.356687898089172, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.7725934982299805, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7047461271286011, + "num_tokens": 319668633.0, + "step": 12354 + }, + { + "epoch": 1.3567977157917857, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3495473861694336, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7080960869789124, + "num_tokens": 319697321.0, + "step": 12355 + }, + { + "epoch": 1.3569075334943994, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.214432716369629, + "learning_rate": 1e-06, + "loss": 1.0811, + "mean_token_accuracy": 0.6841772794723511, + "num_tokens": 319727386.0, + "step": 12356 + }, + { + "epoch": 1.357017351197013, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4261326789855957, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7130306959152222, + "num_tokens": 319751705.0, + "step": 12357 + }, + { + "epoch": 1.3571271688996267, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4387362003326416, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7213331460952759, + "num_tokens": 319777094.0, + "step": 12358 + }, + { + "epoch": 1.3572369866022402, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.1999728679656982, + "learning_rate": 1e-06, + "loss": 0.9824, + 
"mean_token_accuracy": 0.7069538831710815, + "num_tokens": 319806021.0, + "step": 12359 + }, + { + "epoch": 1.357346804304854, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.289727210998535, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7034679055213928, + "num_tokens": 319834341.0, + "step": 12360 + }, + { + "epoch": 1.3574566220074675, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2346105575561523, + "learning_rate": 1e-06, + "loss": 1.0883, + "mean_token_accuracy": 0.6776024103164673, + "num_tokens": 319863895.0, + "step": 12361 + }, + { + "epoch": 1.3575664397100813, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.3023464679718018, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6956901550292969, + "num_tokens": 319893533.0, + "step": 12362 + }, + { + "epoch": 1.3576762574126948, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.2026708126068115, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.690717339515686, + "num_tokens": 319925237.0, + "step": 12363 + }, + { + "epoch": 1.3577860751153086, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.4540460109710693, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7049077749252319, + "num_tokens": 319953740.0, + "step": 12364 + }, + { + "epoch": 1.3578958928179223, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.5976755619049072, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7308581471443176, + "num_tokens": 319975003.0, + "step": 12365 + }, + { + "epoch": 1.3580057105205359, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.209571599960327, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7079296112060547, + "num_tokens": 320005624.0, + "step": 12366 + }, + { + "epoch": 1.3581155282231496, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5149312019348145, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 
0.7117098569869995, + "num_tokens": 320030292.0, + "step": 12367 + }, + { + "epoch": 1.3582253459257632, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.413754463195801, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.7006391882896423, + "num_tokens": 320054812.0, + "step": 12368 + }, + { + "epoch": 1.358335163628377, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2639400959014893, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7099674940109253, + "num_tokens": 320083315.0, + "step": 12369 + }, + { + "epoch": 1.3584449813309907, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.450406789779663, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.7030619978904724, + "num_tokens": 320107574.0, + "step": 12370 + }, + { + "epoch": 1.3585547990336042, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.25573992729187, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6945062875747681, + "num_tokens": 320137930.0, + "step": 12371 + }, + { + "epoch": 1.3586646167362177, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.678880214691162, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7045856714248657, + "num_tokens": 320160452.0, + "step": 12372 + }, + { + "epoch": 1.3587744344388315, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.417294502258301, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6990327835083008, + "num_tokens": 320186376.0, + "step": 12373 + }, + { + "epoch": 1.3588842521414453, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4501519203186035, + "learning_rate": 1e-06, + "loss": 1.0936, + "mean_token_accuracy": 0.6972768306732178, + "num_tokens": 320212668.0, + "step": 12374 + }, + { + "epoch": 1.3589940698440588, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.247023582458496, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7134562134742737, + 
"num_tokens": 320242462.0, + "step": 12375 + }, + { + "epoch": 1.3591038875466726, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2660882472991943, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7255645990371704, + "num_tokens": 320270761.0, + "step": 12376 + }, + { + "epoch": 1.359213705249286, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3028643131256104, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.723666787147522, + "num_tokens": 320299388.0, + "step": 12377 + }, + { + "epoch": 1.3593235229518998, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.193882703781128, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7166763544082642, + "num_tokens": 320329246.0, + "step": 12378 + }, + { + "epoch": 1.3594333406545136, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3644425868988037, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7084015011787415, + "num_tokens": 320357784.0, + "step": 12379 + }, + { + "epoch": 1.3595431583571271, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.1601712703704834, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7124082446098328, + "num_tokens": 320388559.0, + "step": 12380 + }, + { + "epoch": 1.359652976059741, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.31632399559021, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7002904415130615, + "num_tokens": 320413902.0, + "step": 12381 + }, + { + "epoch": 1.3597627937623544, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.7688729763031006, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7117006182670593, + "num_tokens": 320433528.0, + "step": 12382 + }, + { + "epoch": 1.3598726114649682, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3115410804748535, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6906867027282715, + "num_tokens": 320461910.0, + 
"step": 12383 + }, + { + "epoch": 1.359982429167582, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.474463939666748, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7061104774475098, + "num_tokens": 320488243.0, + "step": 12384 + }, + { + "epoch": 1.3600922468701955, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5114879608154297, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6926246881484985, + "num_tokens": 320513492.0, + "step": 12385 + }, + { + "epoch": 1.360202064572809, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.288097620010376, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6958069801330566, + "num_tokens": 320541935.0, + "step": 12386 + }, + { + "epoch": 1.3603118822754228, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.912558078765869, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7399142980575562, + "num_tokens": 320559125.0, + "step": 12387 + }, + { + "epoch": 1.3604216999780365, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.395949125289917, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7085040807723999, + "num_tokens": 320583098.0, + "step": 12388 + }, + { + "epoch": 1.36053151768065, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6580677032470703, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7096225619316101, + "num_tokens": 320603576.0, + "step": 12389 + }, + { + "epoch": 1.3606413353832638, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.9703562259674072, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6886448860168457, + "num_tokens": 320644237.0, + "step": 12390 + }, + { + "epoch": 1.3607511530858774, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.2649028301239014, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6922686100006104, + "num_tokens": 320673042.0, + "step": 12391 + }, + { + "epoch": 
1.360860970788491, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.545607566833496, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7127153277397156, + "num_tokens": 320696520.0, + "step": 12392 + }, + { + "epoch": 1.3609707884911049, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.388559341430664, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7324307560920715, + "num_tokens": 320719902.0, + "step": 12393 + }, + { + "epoch": 1.3610806061937184, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.070826292037964, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.710644006729126, + "num_tokens": 320753425.0, + "step": 12394 + }, + { + "epoch": 1.3611904238963322, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.3760604858398438, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7093532085418701, + "num_tokens": 320780438.0, + "step": 12395 + }, + { + "epoch": 1.3613002415989457, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.5617847442626953, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7230486869812012, + "num_tokens": 320801567.0, + "step": 12396 + }, + { + "epoch": 1.3614100593015594, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.5654430389404297, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7160036563873291, + "num_tokens": 320825226.0, + "step": 12397 + }, + { + "epoch": 1.3615198770041732, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.6036715507507324, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7287435531616211, + "num_tokens": 320845317.0, + "step": 12398 + }, + { + "epoch": 1.3616296947067867, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.943591833114624, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7181079983711243, + "num_tokens": 320863226.0, + "step": 12399 + }, + { + "epoch": 1.3617395124094003, + "ewc_loss": 
1.823902130126953e-05, + "grad_norm": 2.6387901306152344, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7216245532035828, + "num_tokens": 320884355.0, + "step": 12400 + }, + { + "epoch": 1.361849330112014, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4626402854919434, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7118820548057556, + "num_tokens": 320909532.0, + "step": 12401 + }, + { + "epoch": 1.3619591478146278, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4204580783843994, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7020421028137207, + "num_tokens": 320937048.0, + "step": 12402 + }, + { + "epoch": 1.3620689655172413, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4295082092285156, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7139676809310913, + "num_tokens": 320960287.0, + "step": 12403 + }, + { + "epoch": 1.362178783219855, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.783979654312134, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6973652839660645, + "num_tokens": 320979821.0, + "step": 12404 + }, + { + "epoch": 1.3622886009224686, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.676053762435913, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6910510659217834, + "num_tokens": 321002289.0, + "step": 12405 + }, + { + "epoch": 1.3623984186250824, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.301795244216919, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6884008646011353, + "num_tokens": 321031301.0, + "step": 12406 + }, + { + "epoch": 1.3625082363276961, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6308226585388184, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.698158860206604, + "num_tokens": 321054918.0, + "step": 12407 + }, + { + "epoch": 1.3626180540303097, + "ewc_loss": 1.8358230590820312e-05, + 
"grad_norm": 2.3167548179626465, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7222995758056641, + "num_tokens": 321081261.0, + "step": 12408 + }, + { + "epoch": 1.3627278717329234, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.223101854324341, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7244037389755249, + "num_tokens": 321109810.0, + "step": 12409 + }, + { + "epoch": 1.362837689435537, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.56032657623291, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7139907479286194, + "num_tokens": 321132982.0, + "step": 12410 + }, + { + "epoch": 1.3629475071381507, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.249229669570923, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7164564728736877, + "num_tokens": 321162779.0, + "step": 12411 + }, + { + "epoch": 1.3630573248407643, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.7087159156799316, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7475900650024414, + "num_tokens": 321181944.0, + "step": 12412 + }, + { + "epoch": 1.363167142543378, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.282740592956543, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.698523998260498, + "num_tokens": 321210380.0, + "step": 12413 + }, + { + "epoch": 1.3632769602459915, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.486481189727783, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7100210189819336, + "num_tokens": 321235617.0, + "step": 12414 + }, + { + "epoch": 1.3633867779486053, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.456505537033081, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7154718637466431, + "num_tokens": 321260493.0, + "step": 12415 + }, + { + "epoch": 1.363496595651219, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.65636944770813, + 
"learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7285866141319275, + "num_tokens": 321282094.0, + "step": 12416 + }, + { + "epoch": 1.3636064133538326, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.1487724781036377, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.703164279460907, + "num_tokens": 321313841.0, + "step": 12417 + }, + { + "epoch": 1.3637162310564463, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.532021999359131, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7143319249153137, + "num_tokens": 321336344.0, + "step": 12418 + }, + { + "epoch": 1.3638260487590599, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.098393201828003, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6950912475585938, + "num_tokens": 321369387.0, + "step": 12419 + }, + { + "epoch": 1.3639358664616736, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 3.2094290256500244, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7198121547698975, + "num_tokens": 321395461.0, + "step": 12420 + }, + { + "epoch": 1.3640456841642874, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3911075592041016, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7228726148605347, + "num_tokens": 321420665.0, + "step": 12421 + }, + { + "epoch": 1.364155501866901, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3959336280822754, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7198913097381592, + "num_tokens": 321446602.0, + "step": 12422 + }, + { + "epoch": 1.3642653195695145, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.427408456802368, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7016381621360779, + "num_tokens": 321472037.0, + "step": 12423 + }, + { + "epoch": 1.3643751372721282, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3212087154388428, + "learning_rate": 1e-06, + 
"loss": 0.9598, + "mean_token_accuracy": 0.7118439674377441, + "num_tokens": 321498650.0, + "step": 12424 + }, + { + "epoch": 1.364484954974742, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.384894371032715, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6915037631988525, + "num_tokens": 321524056.0, + "step": 12425 + }, + { + "epoch": 1.3645947726773555, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5280747413635254, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6932892203330994, + "num_tokens": 321548443.0, + "step": 12426 + }, + { + "epoch": 1.3647045903799693, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.293813705444336, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6977489590644836, + "num_tokens": 321578386.0, + "step": 12427 + }, + { + "epoch": 1.3648144080825828, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2490081787109375, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7100739479064941, + "num_tokens": 321607798.0, + "step": 12428 + }, + { + "epoch": 1.3649242257851966, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2323951721191406, + "learning_rate": 1e-06, + "loss": 1.0983, + "mean_token_accuracy": 0.681027889251709, + "num_tokens": 321637720.0, + "step": 12429 + }, + { + "epoch": 1.3650340434878103, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4323196411132812, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6910091638565063, + "num_tokens": 321665072.0, + "step": 12430 + }, + { + "epoch": 1.3651438611904239, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3703866004943848, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7198339104652405, + "num_tokens": 321690054.0, + "step": 12431 + }, + { + "epoch": 1.3652536788930376, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.042661666870117, + "learning_rate": 1e-06, + "loss": 0.9798, + 
"mean_token_accuracy": 0.7218179106712341, + "num_tokens": 321722282.0, + "step": 12432 + }, + { + "epoch": 1.3653634965956511, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.185333013534546, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6896030902862549, + "num_tokens": 321754184.0, + "step": 12433 + }, + { + "epoch": 1.365473314298265, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.578021287918091, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7504146099090576, + "num_tokens": 321774303.0, + "step": 12434 + }, + { + "epoch": 1.3655831320008787, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.489434242248535, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7105371952056885, + "num_tokens": 321799656.0, + "step": 12435 + }, + { + "epoch": 1.3656929497034922, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.907132625579834, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.70956951379776, + "num_tokens": 321818801.0, + "step": 12436 + }, + { + "epoch": 1.3658027674061057, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.408224105834961, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6979354023933411, + "num_tokens": 321844992.0, + "step": 12437 + }, + { + "epoch": 1.3659125851087195, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6846139430999756, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7126175165176392, + "num_tokens": 321866970.0, + "step": 12438 + }, + { + "epoch": 1.3660224028113332, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3499503135681152, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.747471272945404, + "num_tokens": 321891675.0, + "step": 12439 + }, + { + "epoch": 1.3661322205139468, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.027618169784546, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 
0.7147530317306519, + "num_tokens": 321926398.0, + "step": 12440 + }, + { + "epoch": 1.3662420382165605, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.443446636199951, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7259711623191833, + "num_tokens": 321950293.0, + "step": 12441 + }, + { + "epoch": 1.366351855919174, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2293405532836914, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7140762209892273, + "num_tokens": 321979849.0, + "step": 12442 + }, + { + "epoch": 1.3664616736217878, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6130363941192627, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7138085961341858, + "num_tokens": 322001928.0, + "step": 12443 + }, + { + "epoch": 1.3665714913244016, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3516600131988525, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7168358564376831, + "num_tokens": 322029905.0, + "step": 12444 + }, + { + "epoch": 1.3666813090270151, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4697465896606445, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7026984095573425, + "num_tokens": 322055755.0, + "step": 12445 + }, + { + "epoch": 1.3667911267296289, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5060744285583496, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6907304525375366, + "num_tokens": 322081185.0, + "step": 12446 + }, + { + "epoch": 1.3669009444322424, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5450189113616943, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7066338658332825, + "num_tokens": 322107051.0, + "step": 12447 + }, + { + "epoch": 1.3670107621348562, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.510436773300171, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7265337705612183, + 
"num_tokens": 322130687.0, + "step": 12448 + }, + { + "epoch": 1.36712057983747, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2316761016845703, + "learning_rate": 1e-06, + "loss": 1.1343, + "mean_token_accuracy": 0.6698524951934814, + "num_tokens": 322164725.0, + "step": 12449 + }, + { + "epoch": 1.3672303975400835, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.124107837677002, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7134804725646973, + "num_tokens": 322193166.0, + "step": 12450 + }, + { + "epoch": 1.367340215242697, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6360585689544678, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7375048398971558, + "num_tokens": 322212005.0, + "step": 12451 + }, + { + "epoch": 1.3674500329453108, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3529114723205566, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6921422481536865, + "num_tokens": 322239553.0, + "step": 12452 + }, + { + "epoch": 1.3675598506479245, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2244484424591064, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7117897868156433, + "num_tokens": 322269202.0, + "step": 12453 + }, + { + "epoch": 1.367669668350538, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3962578773498535, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6950786709785461, + "num_tokens": 322297129.0, + "step": 12454 + }, + { + "epoch": 1.3677794860531518, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.119706630706787, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.697261393070221, + "num_tokens": 322331302.0, + "step": 12455 + }, + { + "epoch": 1.3678893037557653, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.480502128601074, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.70676589012146, + "num_tokens": 322356109.0, + 
"step": 12456 + }, + { + "epoch": 1.367999121458379, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.473914861679077, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6897566318511963, + "num_tokens": 322382372.0, + "step": 12457 + }, + { + "epoch": 1.3681089391609929, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2618072032928467, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7311354279518127, + "num_tokens": 322408429.0, + "step": 12458 + }, + { + "epoch": 1.3682187568636064, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4333877563476562, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7151293754577637, + "num_tokens": 322433048.0, + "step": 12459 + }, + { + "epoch": 1.3683285745662201, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3285324573516846, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7198481559753418, + "num_tokens": 322458545.0, + "step": 12460 + }, + { + "epoch": 1.3684383922688337, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.947105884552002, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.731603741645813, + "num_tokens": 322478183.0, + "step": 12461 + }, + { + "epoch": 1.3685482099714474, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.7555413246154785, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7064081430435181, + "num_tokens": 322499421.0, + "step": 12462 + }, + { + "epoch": 1.3686580276740612, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.292839288711548, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7152348160743713, + "num_tokens": 322528571.0, + "step": 12463 + }, + { + "epoch": 1.3687678453766747, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5389466285705566, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6963577270507812, + "num_tokens": 322552969.0, + "step": 12464 + }, + { + 
"epoch": 1.3688776630792883, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.588960886001587, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7073358297348022, + "num_tokens": 322576081.0, + "step": 12465 + }, + { + "epoch": 1.368987480781902, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4430737495422363, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7101935148239136, + "num_tokens": 322605259.0, + "step": 12466 + }, + { + "epoch": 1.3690972984845158, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2959866523742676, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.719231128692627, + "num_tokens": 322632366.0, + "step": 12467 + }, + { + "epoch": 1.3692071161871293, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4797463417053223, + "learning_rate": 1e-06, + "loss": 1.0835, + "mean_token_accuracy": 0.6781190037727356, + "num_tokens": 322658014.0, + "step": 12468 + }, + { + "epoch": 1.369316933889743, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.516312599182129, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.7112226486206055, + "num_tokens": 322685064.0, + "step": 12469 + }, + { + "epoch": 1.3694267515923566, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4626479148864746, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.721453845500946, + "num_tokens": 322711501.0, + "step": 12470 + }, + { + "epoch": 1.3695365692949704, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.340813636779785, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7251882553100586, + "num_tokens": 322738897.0, + "step": 12471 + }, + { + "epoch": 1.3696463869975841, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.482089042663574, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7181259393692017, + "num_tokens": 322762890.0, + "step": 12472 + }, + { + "epoch": 1.3697562047001977, + 
"ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4137158393859863, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6981921792030334, + "num_tokens": 322788541.0, + "step": 12473 + }, + { + "epoch": 1.3698660224028114, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5129432678222656, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7053542137145996, + "num_tokens": 322812278.0, + "step": 12474 + }, + { + "epoch": 1.369975840105425, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.219627618789673, + "learning_rate": 1e-06, + "loss": 1.1187, + "mean_token_accuracy": 0.6798639297485352, + "num_tokens": 322844266.0, + "step": 12475 + }, + { + "epoch": 1.3700856578080387, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.577599287033081, + "learning_rate": 1e-06, + "loss": 1.0811, + "mean_token_accuracy": 0.6901744604110718, + "num_tokens": 322867518.0, + "step": 12476 + }, + { + "epoch": 1.3701954755106522, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.569431781768799, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7166010141372681, + "num_tokens": 322891856.0, + "step": 12477 + }, + { + "epoch": 1.370305293213266, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4483022689819336, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6897430419921875, + "num_tokens": 322916582.0, + "step": 12478 + }, + { + "epoch": 1.3704151109158795, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.680612564086914, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7085502743721008, + "num_tokens": 322938281.0, + "step": 12479 + }, + { + "epoch": 1.3705249286184933, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6575400829315186, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.710692286491394, + "num_tokens": 322961516.0, + "step": 12480 + }, + { + "epoch": 1.370634746321107, + "ewc_loss": 
1.8358230590820312e-05, + "grad_norm": 2.8841350078582764, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7397642135620117, + "num_tokens": 322979819.0, + "step": 12481 + }, + { + "epoch": 1.3707445640237206, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.1171181201934814, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.7003012895584106, + "num_tokens": 323013915.0, + "step": 12482 + }, + { + "epoch": 1.3708543817263343, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6561880111694336, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7284036874771118, + "num_tokens": 323036826.0, + "step": 12483 + }, + { + "epoch": 1.3709641994289479, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2130837440490723, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7007043957710266, + "num_tokens": 323067041.0, + "step": 12484 + }, + { + "epoch": 1.3710740171315616, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6123390197753906, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7081905603408813, + "num_tokens": 323090266.0, + "step": 12485 + }, + { + "epoch": 1.3711838348341754, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.435091257095337, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6940959692001343, + "num_tokens": 323116084.0, + "step": 12486 + }, + { + "epoch": 1.371293652536789, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2550089359283447, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7128429412841797, + "num_tokens": 323145061.0, + "step": 12487 + }, + { + "epoch": 1.3714034702394025, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5271735191345215, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7143896222114563, + "num_tokens": 323167555.0, + "step": 12488 + }, + { + "epoch": 1.3715132879420162, + "ewc_loss": 1.8358230590820312e-05, + 
"grad_norm": 2.6042957305908203, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7225880026817322, + "num_tokens": 323189204.0, + "step": 12489 + }, + { + "epoch": 1.37162310564463, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4471993446350098, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7227343916893005, + "num_tokens": 323212801.0, + "step": 12490 + }, + { + "epoch": 1.3717329233472435, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2493176460266113, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7230125069618225, + "num_tokens": 323239307.0, + "step": 12491 + }, + { + "epoch": 1.3718427410498573, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.491156816482544, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.7015492916107178, + "num_tokens": 323264180.0, + "step": 12492 + }, + { + "epoch": 1.3719525587524708, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.345496892929077, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6978289484977722, + "num_tokens": 323292103.0, + "step": 12493 + }, + { + "epoch": 1.3720623764550846, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.493309497833252, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7069961428642273, + "num_tokens": 323315127.0, + "step": 12494 + }, + { + "epoch": 1.3721721941576983, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4836947917938232, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6887620091438293, + "num_tokens": 323340495.0, + "step": 12495 + }, + { + "epoch": 1.3722820118603118, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4755208492279053, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6871476173400879, + "num_tokens": 323366720.0, + "step": 12496 + }, + { + "epoch": 1.3723918295629256, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 
2.478996753692627, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.720000147819519, + "num_tokens": 323390323.0, + "step": 12497 + }, + { + "epoch": 1.3725016472655391, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4722745418548584, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7192597985267639, + "num_tokens": 323412899.0, + "step": 12498 + }, + { + "epoch": 1.372611464968153, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 3.0416765213012695, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7230152487754822, + "num_tokens": 323430627.0, + "step": 12499 + }, + { + "epoch": 1.3727212826707667, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.633035898208618, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7262129187583923, + "num_tokens": 323451115.0, + "step": 12500 + }, + { + "epoch": 1.3728311003733802, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4358954429626465, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7058215141296387, + "num_tokens": 323477932.0, + "step": 12501 + }, + { + "epoch": 1.3729409180759937, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.258354425430298, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.702150285243988, + "num_tokens": 323507554.0, + "step": 12502 + }, + { + "epoch": 1.3730507357786075, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3122589588165283, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6972970962524414, + "num_tokens": 323536096.0, + "step": 12503 + }, + { + "epoch": 1.3731605534812212, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2703704833984375, + "learning_rate": 1e-06, + "loss": 1.0888, + "mean_token_accuracy": 0.6785141229629517, + "num_tokens": 323568211.0, + "step": 12504 + }, + { + "epoch": 1.3732703711838348, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2125959396362305, + 
"learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6957696676254272, + "num_tokens": 323597033.0, + "step": 12505 + }, + { + "epoch": 1.3733801888864485, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2922799587249756, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7252346277236938, + "num_tokens": 323623987.0, + "step": 12506 + }, + { + "epoch": 1.373490006589062, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2176249027252197, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7126275300979614, + "num_tokens": 323651896.0, + "step": 12507 + }, + { + "epoch": 1.3735998242916758, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.297377586364746, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.695169985294342, + "num_tokens": 323679655.0, + "step": 12508 + }, + { + "epoch": 1.3737096419942896, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.406038761138916, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7038881778717041, + "num_tokens": 323704418.0, + "step": 12509 + }, + { + "epoch": 1.3738194596969031, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.8424694538116455, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7065271139144897, + "num_tokens": 323727290.0, + "step": 12510 + }, + { + "epoch": 1.3739292773995169, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.308657646179199, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6922060251235962, + "num_tokens": 323756261.0, + "step": 12511 + }, + { + "epoch": 1.3740390951021304, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.1875932216644287, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6931047439575195, + "num_tokens": 323785724.0, + "step": 12512 + }, + { + "epoch": 1.3741489128047442, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.35386323928833, + "learning_rate": 1e-06, + 
"loss": 1.0577, + "mean_token_accuracy": 0.6932477951049805, + "num_tokens": 323812965.0, + "step": 12513 + }, + { + "epoch": 1.374258730507358, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4245543479919434, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7305442094802856, + "num_tokens": 323838244.0, + "step": 12514 + }, + { + "epoch": 1.3743685482099715, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.66546368598938, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7379381656646729, + "num_tokens": 323858223.0, + "step": 12515 + }, + { + "epoch": 1.374478365912585, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.137317419052124, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6991677284240723, + "num_tokens": 323890650.0, + "step": 12516 + }, + { + "epoch": 1.3745881836151987, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.355290651321411, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7092140316963196, + "num_tokens": 323915138.0, + "step": 12517 + }, + { + "epoch": 1.3746980013178125, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5676722526550293, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6924234628677368, + "num_tokens": 323939118.0, + "step": 12518 + }, + { + "epoch": 1.374807819020426, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.1556296348571777, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7080180644989014, + "num_tokens": 323967959.0, + "step": 12519 + }, + { + "epoch": 1.3749176367230398, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5904054641723633, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7132743000984192, + "num_tokens": 323989323.0, + "step": 12520 + }, + { + "epoch": 1.3750274544256533, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6640536785125732, + "learning_rate": 1e-06, + "loss": 0.9412, + 
"mean_token_accuracy": 0.7199428677558899, + "num_tokens": 324010929.0, + "step": 12521 + }, + { + "epoch": 1.375137272128267, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.8072118759155273, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7403556108474731, + "num_tokens": 324029764.0, + "step": 12522 + }, + { + "epoch": 1.3752470898308808, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5579066276550293, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7301908731460571, + "num_tokens": 324051254.0, + "step": 12523 + }, + { + "epoch": 1.3753569075334944, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5386762619018555, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7191095948219299, + "num_tokens": 324073161.0, + "step": 12524 + }, + { + "epoch": 1.3754667252361081, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4933547973632812, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7125496864318848, + "num_tokens": 324096249.0, + "step": 12525 + }, + { + "epoch": 1.3755765429387217, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5491578578948975, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7069387435913086, + "num_tokens": 324119618.0, + "step": 12526 + }, + { + "epoch": 1.3756863606413354, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3143763542175293, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7080186605453491, + "num_tokens": 324149285.0, + "step": 12527 + }, + { + "epoch": 1.3757961783439492, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.778351306915283, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.709986686706543, + "num_tokens": 324170406.0, + "step": 12528 + }, + { + "epoch": 1.3759059960465627, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3457961082458496, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 
0.6931235790252686, + "num_tokens": 324198163.0, + "step": 12529 + }, + { + "epoch": 1.3760158137491763, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2863640785217285, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7073370814323425, + "num_tokens": 324226223.0, + "step": 12530 + }, + { + "epoch": 1.37612563145179, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.406576156616211, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.69490647315979, + "num_tokens": 324252547.0, + "step": 12531 + }, + { + "epoch": 1.3762354491544038, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 3.0190584659576416, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7215524911880493, + "num_tokens": 324269247.0, + "step": 12532 + }, + { + "epoch": 1.3763452668570173, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3547871112823486, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7003589868545532, + "num_tokens": 324294598.0, + "step": 12533 + }, + { + "epoch": 1.376455084559631, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.331815719604492, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6987959146499634, + "num_tokens": 324322960.0, + "step": 12534 + }, + { + "epoch": 1.3765649022622446, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5857229232788086, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7376872301101685, + "num_tokens": 324345051.0, + "step": 12535 + }, + { + "epoch": 1.3766747199648584, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5066206455230713, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6956368684768677, + "num_tokens": 324370134.0, + "step": 12536 + }, + { + "epoch": 1.376784537667472, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.350586175918579, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6841849684715271, + 
"num_tokens": 324396827.0, + "step": 12537 + }, + { + "epoch": 1.3768943553700856, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.8064513206481934, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7068215608596802, + "num_tokens": 324419711.0, + "step": 12538 + }, + { + "epoch": 1.3770041730726994, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5887248516082764, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7090859413146973, + "num_tokens": 324442608.0, + "step": 12539 + }, + { + "epoch": 1.377113990775313, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.671097993850708, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7109845876693726, + "num_tokens": 324463862.0, + "step": 12540 + }, + { + "epoch": 1.3772238084779267, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.578427791595459, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7152920961380005, + "num_tokens": 324485779.0, + "step": 12541 + }, + { + "epoch": 1.3773336261805402, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.422271251678467, + "learning_rate": 1e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.6919058561325073, + "num_tokens": 324513413.0, + "step": 12542 + }, + { + "epoch": 1.377443443883154, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.1005806922912598, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7196369171142578, + "num_tokens": 324543901.0, + "step": 12543 + }, + { + "epoch": 1.3775532615857675, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.6921417713165283, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6962572336196899, + "num_tokens": 324566134.0, + "step": 12544 + }, + { + "epoch": 1.3776630792883813, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.207904577255249, + "learning_rate": 1e-06, + "loss": 1.0957, + "mean_token_accuracy": 0.6871073246002197, + "num_tokens": 324594770.0, + 
"step": 12545 + }, + { + "epoch": 1.377772896990995, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4512295722961426, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6996121406555176, + "num_tokens": 324619443.0, + "step": 12546 + }, + { + "epoch": 1.3778827146936086, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.676769495010376, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7159326076507568, + "num_tokens": 324639903.0, + "step": 12547 + }, + { + "epoch": 1.3779925323962223, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4567270278930664, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7222081422805786, + "num_tokens": 324661797.0, + "step": 12548 + }, + { + "epoch": 1.3781023500988359, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.359589099884033, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7288724184036255, + "num_tokens": 324686076.0, + "step": 12549 + }, + { + "epoch": 1.3782121678014496, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5111374855041504, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7283599376678467, + "num_tokens": 324709315.0, + "step": 12550 + }, + { + "epoch": 1.3783219855040634, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3962631225585938, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7251160144805908, + "num_tokens": 324733142.0, + "step": 12551 + }, + { + "epoch": 1.378431803206677, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.465684175491333, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7301579713821411, + "num_tokens": 324756808.0, + "step": 12552 + }, + { + "epoch": 1.3785416209092904, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.715013265609741, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7081282734870911, + "num_tokens": 324777993.0, + "step": 12553 + }, + { + 
"epoch": 1.3786514386119042, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.212151288986206, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.734217643737793, + "num_tokens": 324806322.0, + "step": 12554 + }, + { + "epoch": 1.378761256314518, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 6.905122756958008, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6934564709663391, + "num_tokens": 324837557.0, + "step": 12555 + }, + { + "epoch": 1.3788710740171315, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.8114571571350098, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6943478584289551, + "num_tokens": 324857980.0, + "step": 12556 + }, + { + "epoch": 1.3789808917197452, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4440274238586426, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7012506127357483, + "num_tokens": 324887487.0, + "step": 12557 + }, + { + "epoch": 1.3790907094223588, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.6461803913116455, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7355731129646301, + "num_tokens": 324908476.0, + "step": 12558 + }, + { + "epoch": 1.3792005271249725, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4484710693359375, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7178001403808594, + "num_tokens": 324934794.0, + "step": 12559 + }, + { + "epoch": 1.3793103448275863, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.2606217861175537, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7271102070808411, + "num_tokens": 324965560.0, + "step": 12560 + }, + { + "epoch": 1.3794201625301998, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.304753303527832, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7118343114852905, + "num_tokens": 324992511.0, + "step": 12561 + }, + { + "epoch": 1.3795299802328136, + 
"ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3664867877960205, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7195373773574829, + "num_tokens": 325019487.0, + "step": 12562 + }, + { + "epoch": 1.3796397979354271, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.449253559112549, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.711887776851654, + "num_tokens": 325043901.0, + "step": 12563 + }, + { + "epoch": 1.3797496156380409, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.337639093399048, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7228744029998779, + "num_tokens": 325068013.0, + "step": 12564 + }, + { + "epoch": 1.3798594333406546, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.481401205062866, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6994602084159851, + "num_tokens": 325094329.0, + "step": 12565 + }, + { + "epoch": 1.3799692510432682, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.2995803356170654, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7124105095863342, + "num_tokens": 325121449.0, + "step": 12566 + }, + { + "epoch": 1.3800790687458817, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3549282550811768, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7168013453483582, + "num_tokens": 325146118.0, + "step": 12567 + }, + { + "epoch": 1.3801888864484955, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.254382610321045, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7022712230682373, + "num_tokens": 325177571.0, + "step": 12568 + }, + { + "epoch": 1.3802987041511092, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2648959159851074, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6858487129211426, + "num_tokens": 325207045.0, + "step": 12569 + }, + { + "epoch": 1.3804085218537228, + "ewc_loss": 
1.8358230590820312e-05, + "grad_norm": 2.323611259460449, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7069482207298279, + "num_tokens": 325233060.0, + "step": 12570 + }, + { + "epoch": 1.3805183395563365, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5168254375457764, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7240892648696899, + "num_tokens": 325255279.0, + "step": 12571 + }, + { + "epoch": 1.38062815725895, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4868643283843994, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.708562433719635, + "num_tokens": 325279494.0, + "step": 12572 + }, + { + "epoch": 1.3807379749615638, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2737462520599365, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7417697906494141, + "num_tokens": 325303432.0, + "step": 12573 + }, + { + "epoch": 1.3808477926641776, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.380138874053955, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6965858936309814, + "num_tokens": 325329242.0, + "step": 12574 + }, + { + "epoch": 1.380957610366791, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.330984115600586, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7052697539329529, + "num_tokens": 325355446.0, + "step": 12575 + }, + { + "epoch": 1.3810674280694049, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.2857744693756104, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7043266892433167, + "num_tokens": 325382954.0, + "step": 12576 + }, + { + "epoch": 1.3811772457720184, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.422959804534912, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6967345476150513, + "num_tokens": 325407603.0, + "step": 12577 + }, + { + "epoch": 1.3812870634746321, + "ewc_loss": 1.8358230590820312e-05, + 
"grad_norm": 2.2936365604400635, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.694614052772522, + "num_tokens": 325434185.0, + "step": 12578 + }, + { + "epoch": 1.381396881177246, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.247205972671509, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7138320803642273, + "num_tokens": 325461343.0, + "step": 12579 + }, + { + "epoch": 1.3815066988798594, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.528514862060547, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.700467586517334, + "num_tokens": 325483556.0, + "step": 12580 + }, + { + "epoch": 1.381616516582473, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.459630012512207, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7005164623260498, + "num_tokens": 325510162.0, + "step": 12581 + }, + { + "epoch": 1.3817263342850867, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.5211353302001953, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7414956092834473, + "num_tokens": 325533463.0, + "step": 12582 + }, + { + "epoch": 1.3818361519877005, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4079034328460693, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7378140687942505, + "num_tokens": 325558235.0, + "step": 12583 + }, + { + "epoch": 1.381945969690314, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.580502510070801, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7164984941482544, + "num_tokens": 325579757.0, + "step": 12584 + }, + { + "epoch": 1.3820557873929278, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3691625595092773, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.690665066242218, + "num_tokens": 325605573.0, + "step": 12585 + }, + { + "epoch": 1.3821656050955413, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.403027296066284, + 
"learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7100287675857544, + "num_tokens": 325629148.0, + "step": 12586 + }, + { + "epoch": 1.382275422798155, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.436981439590454, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7089915871620178, + "num_tokens": 325655848.0, + "step": 12587 + }, + { + "epoch": 1.3823852405007688, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5293667316436768, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7119213342666626, + "num_tokens": 325679168.0, + "step": 12588 + }, + { + "epoch": 1.3824950582033824, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.333117723464966, + "learning_rate": 1e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6921375393867493, + "num_tokens": 325711489.0, + "step": 12589 + }, + { + "epoch": 1.3826048759059961, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.347475051879883, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6838525533676147, + "num_tokens": 325740922.0, + "step": 12590 + }, + { + "epoch": 1.3827146936086097, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.235421657562256, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6969583034515381, + "num_tokens": 325770207.0, + "step": 12591 + }, + { + "epoch": 1.3828245113112234, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.219681978225708, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7331933975219727, + "num_tokens": 325798524.0, + "step": 12592 + }, + { + "epoch": 1.382934329013837, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.4900715351104736, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6909174919128418, + "num_tokens": 325825687.0, + "step": 12593 + }, + { + "epoch": 1.3830441467164507, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.293278694152832, + "learning_rate": 1e-06, + "loss": 
0.9809, + "mean_token_accuracy": 0.7090486884117126, + "num_tokens": 325854967.0, + "step": 12594 + }, + { + "epoch": 1.3831539644190642, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2639784812927246, + "learning_rate": 1e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6867557764053345, + "num_tokens": 325887490.0, + "step": 12595 + }, + { + "epoch": 1.383263782121678, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.7633235454559326, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6956384181976318, + "num_tokens": 325908570.0, + "step": 12596 + }, + { + "epoch": 1.3833735998242918, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.716108560562134, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7135635018348694, + "num_tokens": 325928528.0, + "step": 12597 + }, + { + "epoch": 1.3834834175269053, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3508036136627197, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6971433758735657, + "num_tokens": 325956713.0, + "step": 12598 + }, + { + "epoch": 1.383593235229519, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3222529888153076, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7057069540023804, + "num_tokens": 325984839.0, + "step": 12599 + }, + { + "epoch": 1.3837030529321326, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.207413911819458, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7196296453475952, + "num_tokens": 326011963.0, + "step": 12600 + }, + { + "epoch": 1.3838128706347463, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.399495840072632, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7155447006225586, + "num_tokens": 326037627.0, + "step": 12601 + }, + { + "epoch": 1.38392268833736, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.151475667953491, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 
0.7106427550315857, + "num_tokens": 326068689.0, + "step": 12602 + }, + { + "epoch": 1.3840325060399736, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2946953773498535, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7027891278266907, + "num_tokens": 326099103.0, + "step": 12603 + }, + { + "epoch": 1.3841423237425874, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.497770071029663, + "learning_rate": 1e-06, + "loss": 1.0921, + "mean_token_accuracy": 0.6819535493850708, + "num_tokens": 326125592.0, + "step": 12604 + }, + { + "epoch": 1.384252141445201, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2234508991241455, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7094069719314575, + "num_tokens": 326154108.0, + "step": 12605 + }, + { + "epoch": 1.3843619591478147, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4952259063720703, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7174848914146423, + "num_tokens": 326178614.0, + "step": 12606 + }, + { + "epoch": 1.3844717768504282, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3282361030578613, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6940176486968994, + "num_tokens": 326205864.0, + "step": 12607 + }, + { + "epoch": 1.384581594553042, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.58864688873291, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7011171579360962, + "num_tokens": 326229546.0, + "step": 12608 + }, + { + "epoch": 1.3846914122556555, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.279775381088257, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6977181434631348, + "num_tokens": 326257447.0, + "step": 12609 + }, + { + "epoch": 1.3848012299582693, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.505326271057129, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7162932753562927, + 
"num_tokens": 326279636.0, + "step": 12610 + }, + { + "epoch": 1.384911047660883, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3413190841674805, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6976635456085205, + "num_tokens": 326307847.0, + "step": 12611 + }, + { + "epoch": 1.3850208653634966, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.425360918045044, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7012579441070557, + "num_tokens": 326334739.0, + "step": 12612 + }, + { + "epoch": 1.3851306830661103, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 7.210052967071533, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.735841691493988, + "num_tokens": 326354685.0, + "step": 12613 + }, + { + "epoch": 1.3852405007687238, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.327913522720337, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6919025182723999, + "num_tokens": 326384519.0, + "step": 12614 + }, + { + "epoch": 1.3853503184713376, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.593947410583496, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7304808497428894, + "num_tokens": 326407163.0, + "step": 12615 + }, + { + "epoch": 1.3854601361739514, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.342214822769165, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7040297389030457, + "num_tokens": 326432569.0, + "step": 12616 + }, + { + "epoch": 1.385569953876565, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.2558276653289795, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6938461065292358, + "num_tokens": 326462060.0, + "step": 12617 + }, + { + "epoch": 1.3856797715791784, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4171338081359863, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7224772572517395, + "num_tokens": 326487727.0, + 
"step": 12618 + }, + { + "epoch": 1.3857895892817922, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.59120774269104, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7244666814804077, + "num_tokens": 326510858.0, + "step": 12619 + }, + { + "epoch": 1.385899406984406, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.5963521003723145, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7192081212997437, + "num_tokens": 326532694.0, + "step": 12620 + }, + { + "epoch": 1.3860092246870195, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.345827341079712, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.721748411655426, + "num_tokens": 326560597.0, + "step": 12621 + }, + { + "epoch": 1.3861190423896332, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.600351333618164, + "learning_rate": 1e-06, + "loss": 1.0799, + "mean_token_accuracy": 0.6815088391304016, + "num_tokens": 326582964.0, + "step": 12622 + }, + { + "epoch": 1.3862288600922468, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.332498788833618, + "learning_rate": 1e-06, + "loss": 1.0917, + "mean_token_accuracy": 0.6936262845993042, + "num_tokens": 326609534.0, + "step": 12623 + }, + { + "epoch": 1.3863386777948605, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.403153657913208, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7260448932647705, + "num_tokens": 326634252.0, + "step": 12624 + }, + { + "epoch": 1.3864484954974743, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4271352291107178, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7243629693984985, + "num_tokens": 326658361.0, + "step": 12625 + }, + { + "epoch": 1.3865583132000878, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.63785457611084, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7263569831848145, + "num_tokens": 326678597.0, + "step": 12626 + }, + { + "epoch": 
1.3866681309027016, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4172158241271973, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6994041204452515, + "num_tokens": 326707463.0, + "step": 12627 + }, + { + "epoch": 1.3867779486053151, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.1823976039886475, + "learning_rate": 1e-06, + "loss": 1.1069, + "mean_token_accuracy": 0.6784493923187256, + "num_tokens": 326740808.0, + "step": 12628 + }, + { + "epoch": 1.3868877663079289, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.172808885574341, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.7025083303451538, + "num_tokens": 326770351.0, + "step": 12629 + }, + { + "epoch": 1.3869975840105426, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3474395275115967, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7200130224227905, + "num_tokens": 326795628.0, + "step": 12630 + }, + { + "epoch": 1.3871074017131562, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.1058013439178467, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7030739784240723, + "num_tokens": 326828612.0, + "step": 12631 + }, + { + "epoch": 1.3872172194157697, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4508368968963623, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7198420166969299, + "num_tokens": 326852073.0, + "step": 12632 + }, + { + "epoch": 1.3873270371183835, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.343655586242676, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7341527342796326, + "num_tokens": 326877643.0, + "step": 12633 + }, + { + "epoch": 1.3874368548209972, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.4081320762634277, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6996885538101196, + "num_tokens": 326902042.0, + "step": 12634 + }, + { + "epoch": 1.3875466725236107, + 
"ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.349658489227295, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7088065147399902, + "num_tokens": 326927492.0, + "step": 12635 + }, + { + "epoch": 1.3876564902262245, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.2516322135925293, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7097862958908081, + "num_tokens": 326955640.0, + "step": 12636 + }, + { + "epoch": 1.387766307928838, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.3111355304718018, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.705879807472229, + "num_tokens": 326982934.0, + "step": 12637 + }, + { + "epoch": 1.3878761256314518, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 2.7728424072265625, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.701470136642456, + "num_tokens": 327003284.0, + "step": 12638 + }, + { + "epoch": 1.3879859433340656, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.065092086791992, + "learning_rate": 1e-06, + "loss": 1.1038, + "mean_token_accuracy": 0.6754385232925415, + "num_tokens": 327039297.0, + "step": 12639 + }, + { + "epoch": 1.388095761036679, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.2986183166503906, + "learning_rate": 1e-06, + "loss": 1.0843, + "mean_token_accuracy": 0.6844263672828674, + "num_tokens": 327068485.0, + "step": 12640 + }, + { + "epoch": 1.3882055787392928, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3152105808258057, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7061073780059814, + "num_tokens": 327094322.0, + "step": 12641 + }, + { + "epoch": 1.3883153964419064, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.493180513381958, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7389823794364929, + "num_tokens": 327118567.0, + "step": 12642 + }, + { + "epoch": 1.3884252141445201, + "ewc_loss": 
1.8596649169921875e-05, + "grad_norm": 2.3225016593933105, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7119134664535522, + "num_tokens": 327144865.0, + "step": 12643 + }, + { + "epoch": 1.388535031847134, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1349213123321533, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7000113725662231, + "num_tokens": 327174998.0, + "step": 12644 + }, + { + "epoch": 1.3886448495497474, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3272933959960938, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6922560930252075, + "num_tokens": 327200509.0, + "step": 12645 + }, + { + "epoch": 1.388754667252361, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2819371223449707, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7208272218704224, + "num_tokens": 327225745.0, + "step": 12646 + }, + { + "epoch": 1.3888644849549747, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.2671573162078857, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6870167255401611, + "num_tokens": 327254984.0, + "step": 12647 + }, + { + "epoch": 1.3889743026575885, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.2749626636505127, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7198851108551025, + "num_tokens": 327279223.0, + "step": 12648 + }, + { + "epoch": 1.389084120360202, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.5626566410064697, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7117232084274292, + "num_tokens": 327303319.0, + "step": 12649 + }, + { + "epoch": 1.3891939380628158, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.2049849033355713, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7022589445114136, + "num_tokens": 327331675.0, + "step": 12650 + }, + { + "epoch": 1.3893037557654293, + "ewc_loss": 1.8477439880371094e-05, + 
"grad_norm": 2.4419429302215576, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7105154991149902, + "num_tokens": 327355990.0, + "step": 12651 + }, + { + "epoch": 1.389413573468043, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4797356128692627, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7216559648513794, + "num_tokens": 327379324.0, + "step": 12652 + }, + { + "epoch": 1.3895233911706568, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.5025203227996826, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7000551223754883, + "num_tokens": 327402498.0, + "step": 12653 + }, + { + "epoch": 1.3896332088732704, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.484034776687622, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7025166749954224, + "num_tokens": 327426181.0, + "step": 12654 + }, + { + "epoch": 1.389743026575884, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.204101800918579, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.703490674495697, + "num_tokens": 327455846.0, + "step": 12655 + }, + { + "epoch": 1.3898528442784976, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3075084686279297, + "learning_rate": 1e-06, + "loss": 1.0904, + "mean_token_accuracy": 0.6801541447639465, + "num_tokens": 327486203.0, + "step": 12656 + }, + { + "epoch": 1.3899626619811114, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.562387228012085, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7340822815895081, + "num_tokens": 327510687.0, + "step": 12657 + }, + { + "epoch": 1.390072479683725, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.3435723781585693, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7093050479888916, + "num_tokens": 327536849.0, + "step": 12658 + }, + { + "epoch": 1.3901822973863387, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.461458683013916, 
+ "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.704447329044342, + "num_tokens": 327559433.0, + "step": 12659 + }, + { + "epoch": 1.3902921150889522, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 2.621711254119873, + "learning_rate": 1e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7573543787002563, + "num_tokens": 327580154.0, + "step": 12660 + }, + { + "epoch": 1.390401932791566, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.605623722076416, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7271620035171509, + "num_tokens": 327602813.0, + "step": 12661 + }, + { + "epoch": 1.3905117504941797, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5103728771209717, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7235984206199646, + "num_tokens": 327629603.0, + "step": 12662 + }, + { + "epoch": 1.3906215681967933, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6494531631469727, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7041721343994141, + "num_tokens": 327654648.0, + "step": 12663 + }, + { + "epoch": 1.390731385899407, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1867835521698, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6957781314849854, + "num_tokens": 327684120.0, + "step": 12664 + }, + { + "epoch": 1.3908412036020206, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.51568603515625, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7036019563674927, + "num_tokens": 327707176.0, + "step": 12665 + }, + { + "epoch": 1.3909510213046343, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3200974464416504, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6911200284957886, + "num_tokens": 327734720.0, + "step": 12666 + }, + { + "epoch": 1.391060839007248, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.434335470199585, + "learning_rate": 1e-06, + "loss": 
0.964, + "mean_token_accuracy": 0.7144721746444702, + "num_tokens": 327760150.0, + "step": 12667 + }, + { + "epoch": 1.3911706567098616, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.372025489807129, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7116974592208862, + "num_tokens": 327784149.0, + "step": 12668 + }, + { + "epoch": 1.3912804744124752, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4170382022857666, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7220637798309326, + "num_tokens": 327808125.0, + "step": 12669 + }, + { + "epoch": 1.391390292115089, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5288987159729004, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7243021726608276, + "num_tokens": 327830077.0, + "step": 12670 + }, + { + "epoch": 1.3915001098177027, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.292623281478882, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7090634107589722, + "num_tokens": 327857461.0, + "step": 12671 + }, + { + "epoch": 1.3916099275203162, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.657111167907715, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7171192169189453, + "num_tokens": 327877176.0, + "step": 12672 + }, + { + "epoch": 1.39171974522293, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.8893988132476807, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7358562350273132, + "num_tokens": 327895395.0, + "step": 12673 + }, + { + "epoch": 1.3918295629255435, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1124446392059326, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7164709568023682, + "num_tokens": 327927128.0, + "step": 12674 + }, + { + "epoch": 1.3919393806281573, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4445278644561768, + "learning_rate": 1e-06, + "loss": 0.8996, + 
"mean_token_accuracy": 0.7334474325180054, + "num_tokens": 327951863.0, + "step": 12675 + }, + { + "epoch": 1.392049198330771, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.394838809967041, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7254483103752136, + "num_tokens": 327975457.0, + "step": 12676 + }, + { + "epoch": 1.3921590160333845, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.411043405532837, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7012702226638794, + "num_tokens": 328001139.0, + "step": 12677 + }, + { + "epoch": 1.3922688337359983, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3623218536376953, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6983401775360107, + "num_tokens": 328027431.0, + "step": 12678 + }, + { + "epoch": 1.3923786514386118, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3689022064208984, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6923620104789734, + "num_tokens": 328054469.0, + "step": 12679 + }, + { + "epoch": 1.3924884691412256, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1446828842163086, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7016662359237671, + "num_tokens": 328086474.0, + "step": 12680 + }, + { + "epoch": 1.3925982868438394, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.325604200363159, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6995164155960083, + "num_tokens": 328114272.0, + "step": 12681 + }, + { + "epoch": 1.3927081045464529, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3830811977386475, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7050865292549133, + "num_tokens": 328139879.0, + "step": 12682 + }, + { + "epoch": 1.3928179222490664, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.557241916656494, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 
0.6960829496383667, + "num_tokens": 328163458.0, + "step": 12683 + }, + { + "epoch": 1.3929277399516802, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.367671489715576, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7105521559715271, + "num_tokens": 328189557.0, + "step": 12684 + }, + { + "epoch": 1.393037557654294, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3228931427001953, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.7024780511856079, + "num_tokens": 328216781.0, + "step": 12685 + }, + { + "epoch": 1.3931473753569075, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2266435623168945, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7197698950767517, + "num_tokens": 328248094.0, + "step": 12686 + }, + { + "epoch": 1.3932571930595212, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1464149951934814, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6902724504470825, + "num_tokens": 328282358.0, + "step": 12687 + }, + { + "epoch": 1.3933670107621348, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.0258193016052246, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6917151212692261, + "num_tokens": 328318452.0, + "step": 12688 + }, + { + "epoch": 1.3934768284647485, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.340204954147339, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6944227814674377, + "num_tokens": 328344819.0, + "step": 12689 + }, + { + "epoch": 1.3935866461673623, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.675037384033203, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6920259594917297, + "num_tokens": 328366894.0, + "step": 12690 + }, + { + "epoch": 1.3936964638699758, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.455402374267578, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.7021353840827942, + 
"num_tokens": 328393601.0, + "step": 12691 + }, + { + "epoch": 1.3938062815725896, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3388960361480713, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7158467769622803, + "num_tokens": 328419230.0, + "step": 12692 + }, + { + "epoch": 1.393916099275203, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5158464908599854, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7028871178627014, + "num_tokens": 328443372.0, + "step": 12693 + }, + { + "epoch": 1.3940259169778169, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.511582612991333, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7190817594528198, + "num_tokens": 328465375.0, + "step": 12694 + }, + { + "epoch": 1.3941357346804306, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.252643346786499, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.7080690860748291, + "num_tokens": 328497099.0, + "step": 12695 + }, + { + "epoch": 1.3942455523830442, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4650559425354004, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7372563481330872, + "num_tokens": 328517455.0, + "step": 12696 + }, + { + "epoch": 1.3943553700856577, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.353516101837158, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6964238882064819, + "num_tokens": 328542932.0, + "step": 12697 + }, + { + "epoch": 1.3944651877882714, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5321009159088135, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7205114364624023, + "num_tokens": 328566209.0, + "step": 12698 + }, + { + "epoch": 1.3945750054908852, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3316407203674316, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7099168300628662, + "num_tokens": 328591339.0, + 
"step": 12699 + }, + { + "epoch": 1.3946848231934987, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.0707404613494873, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6852149963378906, + "num_tokens": 328624959.0, + "step": 12700 + }, + { + "epoch": 1.3947946408961125, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4057860374450684, + "learning_rate": 1e-06, + "loss": 1.0796, + "mean_token_accuracy": 0.6830816864967346, + "num_tokens": 328653922.0, + "step": 12701 + }, + { + "epoch": 1.394904458598726, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.685595989227295, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7211428880691528, + "num_tokens": 328674331.0, + "step": 12702 + }, + { + "epoch": 1.3950142763013398, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.39034104347229, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7252095937728882, + "num_tokens": 328698819.0, + "step": 12703 + }, + { + "epoch": 1.3951240940039535, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2396340370178223, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7075275182723999, + "num_tokens": 328727751.0, + "step": 12704 + }, + { + "epoch": 1.395233911706567, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.615309953689575, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7270995378494263, + "num_tokens": 328749952.0, + "step": 12705 + }, + { + "epoch": 1.3953437294091808, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3227345943450928, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7344101667404175, + "num_tokens": 328773941.0, + "step": 12706 + }, + { + "epoch": 1.3954535471117944, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4915366172790527, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6896119713783264, + "num_tokens": 328799129.0, + "step": 12707 + }, + { + 
"epoch": 1.3955633648144081, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3118882179260254, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7088854312896729, + "num_tokens": 328826607.0, + "step": 12708 + }, + { + "epoch": 1.3956731825170219, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.579883575439453, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7331463098526001, + "num_tokens": 328851609.0, + "step": 12709 + }, + { + "epoch": 1.3957830002196354, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2826087474823, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7070013880729675, + "num_tokens": 328878492.0, + "step": 12710 + }, + { + "epoch": 1.395892817922249, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.515523672103882, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7142465114593506, + "num_tokens": 328903482.0, + "step": 12711 + }, + { + "epoch": 1.3960026356248627, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.520470142364502, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7173416614532471, + "num_tokens": 328925219.0, + "step": 12712 + }, + { + "epoch": 1.3961124533274765, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.357356309890747, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7055734395980835, + "num_tokens": 328949992.0, + "step": 12713 + }, + { + "epoch": 1.39622227103009, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.719738006591797, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7231071591377258, + "num_tokens": 328972712.0, + "step": 12714 + }, + { + "epoch": 1.3963320887327038, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.393609046936035, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6925246119499207, + "num_tokens": 329000792.0, + "step": 12715 + }, + { + "epoch": 1.3964419064353173, + 
"ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2716054916381836, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7180868983268738, + "num_tokens": 329028808.0, + "step": 12716 + }, + { + "epoch": 1.396551724137931, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3557024002075195, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7150255441665649, + "num_tokens": 329056023.0, + "step": 12717 + }, + { + "epoch": 1.3966615418405448, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.216560125350952, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6928234100341797, + "num_tokens": 329087466.0, + "step": 12718 + }, + { + "epoch": 1.3967713595431583, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.350939989089966, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7098846435546875, + "num_tokens": 329113202.0, + "step": 12719 + }, + { + "epoch": 1.396881177245772, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.537902355194092, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6960371136665344, + "num_tokens": 329138724.0, + "step": 12720 + }, + { + "epoch": 1.3969909949483856, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3906495571136475, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6937235593795776, + "num_tokens": 329165168.0, + "step": 12721 + }, + { + "epoch": 1.3971008126509994, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.441962480545044, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6997600197792053, + "num_tokens": 329189652.0, + "step": 12722 + }, + { + "epoch": 1.397210630353613, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.509305715560913, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7432893514633179, + "num_tokens": 329211107.0, + "step": 12723 + }, + { + "epoch": 1.3973204480562267, + "ewc_loss": 
1.8596649169921875e-05, + "grad_norm": 2.658534049987793, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.7021820545196533, + "num_tokens": 329233763.0, + "step": 12724 + }, + { + "epoch": 1.3974302657588402, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5829782485961914, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7081553936004639, + "num_tokens": 329256232.0, + "step": 12725 + }, + { + "epoch": 1.397540083461454, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5782692432403564, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7109757661819458, + "num_tokens": 329278118.0, + "step": 12726 + }, + { + "epoch": 1.3976499011640677, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2678449153900146, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7349886298179626, + "num_tokens": 329306441.0, + "step": 12727 + }, + { + "epoch": 1.3977597188666813, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6120829582214355, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6952725648880005, + "num_tokens": 329338047.0, + "step": 12728 + }, + { + "epoch": 1.397869536569295, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5329983234405518, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7251741886138916, + "num_tokens": 329361361.0, + "step": 12729 + }, + { + "epoch": 1.3979793542719086, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6610822677612305, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7197461128234863, + "num_tokens": 329382056.0, + "step": 12730 + }, + { + "epoch": 1.3980891719745223, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6839327812194824, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7216213345527649, + "num_tokens": 329404022.0, + "step": 12731 + }, + { + "epoch": 1.398198989677136, + "ewc_loss": 1.8596649169921875e-05, + 
"grad_norm": 2.3762543201446533, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7014865279197693, + "num_tokens": 329430905.0, + "step": 12732 + }, + { + "epoch": 1.3983088073797496, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.51533842086792, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7129924893379211, + "num_tokens": 329453502.0, + "step": 12733 + }, + { + "epoch": 1.3984186250823631, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3355038166046143, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6971864104270935, + "num_tokens": 329482453.0, + "step": 12734 + }, + { + "epoch": 1.398528442784977, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.178399085998535, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6858452558517456, + "num_tokens": 329514440.0, + "step": 12735 + }, + { + "epoch": 1.3986382604875907, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.657585620880127, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7509117722511292, + "num_tokens": 329534295.0, + "step": 12736 + }, + { + "epoch": 1.3987480781902042, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.600705862045288, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6997894048690796, + "num_tokens": 329555527.0, + "step": 12737 + }, + { + "epoch": 1.398857895892818, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2152745723724365, + "learning_rate": 1e-06, + "loss": 1.1002, + "mean_token_accuracy": 0.6737622618675232, + "num_tokens": 329586322.0, + "step": 12738 + }, + { + "epoch": 1.3989677135954315, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4617691040039062, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7111979126930237, + "num_tokens": 329611497.0, + "step": 12739 + }, + { + "epoch": 1.3990775312980452, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 
2.4294095039367676, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7148470282554626, + "num_tokens": 329636836.0, + "step": 12740 + }, + { + "epoch": 1.399187349000659, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6167993545532227, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7108455896377563, + "num_tokens": 329661010.0, + "step": 12741 + }, + { + "epoch": 1.3992971667032725, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6494333744049072, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7131850123405457, + "num_tokens": 329682644.0, + "step": 12742 + }, + { + "epoch": 1.3994069844058863, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3749208450317383, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7171055674552917, + "num_tokens": 329707774.0, + "step": 12743 + }, + { + "epoch": 1.3995168021084998, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.310670852661133, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7120342254638672, + "num_tokens": 329735999.0, + "step": 12744 + }, + { + "epoch": 1.3996266198111136, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.313464403152466, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.6878985166549683, + "num_tokens": 329765151.0, + "step": 12745 + }, + { + "epoch": 1.3997364375137273, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.330821990966797, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7293472290039062, + "num_tokens": 329791205.0, + "step": 12746 + }, + { + "epoch": 1.3998462552163409, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4349400997161865, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6867694854736328, + "num_tokens": 329815303.0, + "step": 12747 + }, + { + "epoch": 1.3999560729189544, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4807658195495605, + 
"learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7249169945716858, + "num_tokens": 329838161.0, + "step": 12748 + }, + { + "epoch": 1.4000658906215682, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.172220468521118, + "learning_rate": 1e-06, + "loss": 1.0604, + "mean_token_accuracy": 0.6890390515327454, + "num_tokens": 329869874.0, + "step": 12749 + }, + { + "epoch": 1.400175708324182, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5982329845428467, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7352875471115112, + "num_tokens": 329890223.0, + "step": 12750 + }, + { + "epoch": 1.4002855260267955, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3128867149353027, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7067270278930664, + "num_tokens": 329918639.0, + "step": 12751 + }, + { + "epoch": 1.4003953437294092, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.549637794494629, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7040348649024963, + "num_tokens": 329943018.0, + "step": 12752 + }, + { + "epoch": 1.4005051614320227, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5334112644195557, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7143387198448181, + "num_tokens": 329968118.0, + "step": 12753 + }, + { + "epoch": 1.4006149791346365, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.267805576324463, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7195712327957153, + "num_tokens": 329996407.0, + "step": 12754 + }, + { + "epoch": 1.4007247968372503, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4006941318511963, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6951366066932678, + "num_tokens": 330020797.0, + "step": 12755 + }, + { + "epoch": 1.4008346145398638, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3974123001098633, + "learning_rate": 1e-06, + 
"loss": 1.002, + "mean_token_accuracy": 0.7056765556335449, + "num_tokens": 330046151.0, + "step": 12756 + }, + { + "epoch": 1.4009444322424776, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 7.068577289581299, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.729649543762207, + "num_tokens": 330067459.0, + "step": 12757 + }, + { + "epoch": 1.401054249945091, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.445819854736328, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7264581918716431, + "num_tokens": 330090836.0, + "step": 12758 + }, + { + "epoch": 1.4011640676477048, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.8633837699890137, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.686204195022583, + "num_tokens": 330112581.0, + "step": 12759 + }, + { + "epoch": 1.4012738853503186, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6684787273406982, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7062070369720459, + "num_tokens": 330134770.0, + "step": 12760 + }, + { + "epoch": 1.4013837030529321, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4060850143432617, + "learning_rate": 1e-06, + "loss": 1.1143, + "mean_token_accuracy": 0.681624174118042, + "num_tokens": 330162305.0, + "step": 12761 + }, + { + "epoch": 1.4014935207555457, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.127110242843628, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6864749193191528, + "num_tokens": 330195513.0, + "step": 12762 + }, + { + "epoch": 1.4016033384581594, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.292060613632202, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6944137215614319, + "num_tokens": 330224612.0, + "step": 12763 + }, + { + "epoch": 1.4017131561607732, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6657392978668213, + "learning_rate": 1e-06, + "loss": 1.0086, + 
"mean_token_accuracy": 0.695487916469574, + "num_tokens": 330245872.0, + "step": 12764 + }, + { + "epoch": 1.4018229738633867, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.946416139602661, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7203983068466187, + "num_tokens": 330263478.0, + "step": 12765 + }, + { + "epoch": 1.4019327915660005, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.259833812713623, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.7117959260940552, + "num_tokens": 330292579.0, + "step": 12766 + }, + { + "epoch": 1.402042609268614, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.523711681365967, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7069829702377319, + "num_tokens": 330315212.0, + "step": 12767 + }, + { + "epoch": 1.4021524269712278, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2735681533813477, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7050625085830688, + "num_tokens": 330343511.0, + "step": 12768 + }, + { + "epoch": 1.4022622446738415, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3640754222869873, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7170590758323669, + "num_tokens": 330371013.0, + "step": 12769 + }, + { + "epoch": 1.402372062376455, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6433019638061523, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7407673001289368, + "num_tokens": 330390565.0, + "step": 12770 + }, + { + "epoch": 1.4024818800790688, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4802536964416504, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7317689657211304, + "num_tokens": 330414047.0, + "step": 12771 + }, + { + "epoch": 1.4025916977816824, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.452226161956787, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 
0.7142225503921509, + "num_tokens": 330440831.0, + "step": 12772 + }, + { + "epoch": 1.4027015154842961, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.638394832611084, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7390372157096863, + "num_tokens": 330460607.0, + "step": 12773 + }, + { + "epoch": 1.4028113331869096, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.141650915145874, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6874710321426392, + "num_tokens": 330493053.0, + "step": 12774 + }, + { + "epoch": 1.4029211508895234, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.034282922744751, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7111510634422302, + "num_tokens": 330525499.0, + "step": 12775 + }, + { + "epoch": 1.403030968592137, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3702898025512695, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6786497831344604, + "num_tokens": 330552529.0, + "step": 12776 + }, + { + "epoch": 1.4031407862947507, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1174561977386475, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6873747110366821, + "num_tokens": 330584224.0, + "step": 12777 + }, + { + "epoch": 1.4032506039973645, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.471656560897827, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7075667381286621, + "num_tokens": 330608743.0, + "step": 12778 + }, + { + "epoch": 1.403360421699978, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.264547348022461, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7082961201667786, + "num_tokens": 330637403.0, + "step": 12779 + }, + { + "epoch": 1.4034702394025917, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1996707916259766, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7057470083236694, + 
"num_tokens": 330669501.0, + "step": 12780 + }, + { + "epoch": 1.4035800571052053, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2106783390045166, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7008559703826904, + "num_tokens": 330701549.0, + "step": 12781 + }, + { + "epoch": 1.403689874807819, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.220200300216675, + "learning_rate": 1e-06, + "loss": 1.0685, + "mean_token_accuracy": 0.6932789087295532, + "num_tokens": 330729181.0, + "step": 12782 + }, + { + "epoch": 1.4037996925104328, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.306879997253418, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6900676488876343, + "num_tokens": 330757862.0, + "step": 12783 + }, + { + "epoch": 1.4039095102130463, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.39656138420105, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7062561511993408, + "num_tokens": 330783695.0, + "step": 12784 + }, + { + "epoch": 1.40401932791566, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.064842462539673, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6988485455513, + "num_tokens": 330817814.0, + "step": 12785 + }, + { + "epoch": 1.4041291456182736, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2819137573242188, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7184890508651733, + "num_tokens": 330844176.0, + "step": 12786 + }, + { + "epoch": 1.4042389633208874, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3740644454956055, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6977957487106323, + "num_tokens": 330870796.0, + "step": 12787 + }, + { + "epoch": 1.404348781023501, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.440122365951538, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7090113759040833, + "num_tokens": 330894438.0, + "step": 
12788 + }, + { + "epoch": 1.4044585987261147, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4489333629608154, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6993634104728699, + "num_tokens": 330921480.0, + "step": 12789 + }, + { + "epoch": 1.4045684164287282, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5176236629486084, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7001611590385437, + "num_tokens": 330946772.0, + "step": 12790 + }, + { + "epoch": 1.404678234131342, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.254945755004883, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7094584703445435, + "num_tokens": 330979370.0, + "step": 12791 + }, + { + "epoch": 1.4047880518339557, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.432636260986328, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6942662000656128, + "num_tokens": 331002954.0, + "step": 12792 + }, + { + "epoch": 1.4048978695365693, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.0866048336029053, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6881974935531616, + "num_tokens": 331035129.0, + "step": 12793 + }, + { + "epoch": 1.405007687239183, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5015757083892822, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.703162670135498, + "num_tokens": 331059434.0, + "step": 12794 + }, + { + "epoch": 1.4051175049417965, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.692619562149048, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6961180567741394, + "num_tokens": 331080140.0, + "step": 12795 + }, + { + "epoch": 1.4052273226444103, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.275616407394409, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7182387113571167, + "num_tokens": 331106893.0, + "step": 12796 + }, + { + "epoch": 
1.405337140347024, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.7107107639312744, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7395214438438416, + "num_tokens": 331126529.0, + "step": 12797 + }, + { + "epoch": 1.4054469580496376, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.178050994873047, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6970712542533875, + "num_tokens": 331158566.0, + "step": 12798 + }, + { + "epoch": 1.4055567757522511, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.237987756729126, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7083476185798645, + "num_tokens": 331186311.0, + "step": 12799 + }, + { + "epoch": 1.4056665934548649, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.153120994567871, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7318426370620728, + "num_tokens": 331215102.0, + "step": 12800 + }, + { + "epoch": 1.4057764111574786, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.597656726837158, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7022483348846436, + "num_tokens": 331236980.0, + "step": 12801 + }, + { + "epoch": 1.4058862288600922, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4970703125, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6899784803390503, + "num_tokens": 331261948.0, + "step": 12802 + }, + { + "epoch": 1.405996046562706, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3862154483795166, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7045159339904785, + "num_tokens": 331288192.0, + "step": 12803 + }, + { + "epoch": 1.4061058642653195, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.168755292892456, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.709661602973938, + "num_tokens": 331317084.0, + "step": 12804 + }, + { + "epoch": 1.4062156819679332, + "ewc_loss": 
1.8596649169921875e-05, + "grad_norm": 2.1370747089385986, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.727414608001709, + "num_tokens": 331345302.0, + "step": 12805 + }, + { + "epoch": 1.406325499670547, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2657904624938965, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7116351127624512, + "num_tokens": 331372262.0, + "step": 12806 + }, + { + "epoch": 1.4064353173731605, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2671453952789307, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7182056903839111, + "num_tokens": 331398490.0, + "step": 12807 + }, + { + "epoch": 1.4065451350757743, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.376809597015381, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.6856093406677246, + "num_tokens": 331424470.0, + "step": 12808 + }, + { + "epoch": 1.4066549527783878, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4988245964050293, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6972188353538513, + "num_tokens": 331447838.0, + "step": 12809 + }, + { + "epoch": 1.4067647704810016, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6558003425598145, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7042886018753052, + "num_tokens": 331470010.0, + "step": 12810 + }, + { + "epoch": 1.4068745881836153, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.349867105484009, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6936891078948975, + "num_tokens": 331496230.0, + "step": 12811 + }, + { + "epoch": 1.4069844058862289, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3411612510681152, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.684828519821167, + "num_tokens": 331525604.0, + "step": 12812 + }, + { + "epoch": 1.4070942235888424, + "ewc_loss": 1.8596649169921875e-05, + 
"grad_norm": 2.261146068572998, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7058529853820801, + "num_tokens": 331553471.0, + "step": 12813 + }, + { + "epoch": 1.4072040412914562, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.502544641494751, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7277287840843201, + "num_tokens": 331575373.0, + "step": 12814 + }, + { + "epoch": 1.40731385899407, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.232705593109131, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7059106826782227, + "num_tokens": 331604945.0, + "step": 12815 + }, + { + "epoch": 1.4074236766966834, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3288414478302, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6921133995056152, + "num_tokens": 331632112.0, + "step": 12816 + }, + { + "epoch": 1.4075334943992972, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.19901180267334, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7118419408798218, + "num_tokens": 331659631.0, + "step": 12817 + }, + { + "epoch": 1.4076433121019107, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.556854248046875, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7290142774581909, + "num_tokens": 331681471.0, + "step": 12818 + }, + { + "epoch": 1.4077531298045245, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5288918018341064, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7271082401275635, + "num_tokens": 331704106.0, + "step": 12819 + }, + { + "epoch": 1.4078629475071383, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1241965293884277, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6869430541992188, + "num_tokens": 331736036.0, + "step": 12820 + }, + { + "epoch": 1.4079727652097518, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4179487228393555, + 
"learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7061697244644165, + "num_tokens": 331761772.0, + "step": 12821 + }, + { + "epoch": 1.4080825829123655, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.444202423095703, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7181063294410706, + "num_tokens": 331785723.0, + "step": 12822 + }, + { + "epoch": 1.408192400614979, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.494924545288086, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.714353621006012, + "num_tokens": 331809006.0, + "step": 12823 + }, + { + "epoch": 1.4083022183175928, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6207361221313477, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7301650047302246, + "num_tokens": 331830095.0, + "step": 12824 + }, + { + "epoch": 1.4084120360202066, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5139832496643066, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7377012968063354, + "num_tokens": 331852970.0, + "step": 12825 + }, + { + "epoch": 1.4085218537228201, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2208244800567627, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7228877544403076, + "num_tokens": 331880021.0, + "step": 12826 + }, + { + "epoch": 1.4086316714254337, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2335171699523926, + "learning_rate": 1e-06, + "loss": 1.0897, + "mean_token_accuracy": 0.6830809116363525, + "num_tokens": 331909539.0, + "step": 12827 + }, + { + "epoch": 1.4087414891280474, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.291295289993286, + "learning_rate": 1e-06, + "loss": 1.0977, + "mean_token_accuracy": 0.6813471913337708, + "num_tokens": 331938496.0, + "step": 12828 + }, + { + "epoch": 1.4088513068306612, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.0958974361419678, + "learning_rate": 1e-06, + 
"loss": 0.9658, + "mean_token_accuracy": 0.7256346344947815, + "num_tokens": 331970350.0, + "step": 12829 + }, + { + "epoch": 1.4089611245332747, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5818963050842285, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7098278999328613, + "num_tokens": 331991869.0, + "step": 12830 + }, + { + "epoch": 1.4090709422358885, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6150221824645996, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7151201367378235, + "num_tokens": 332013233.0, + "step": 12831 + }, + { + "epoch": 1.409180759938502, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.293714761734009, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7015396952629089, + "num_tokens": 332039007.0, + "step": 12832 + }, + { + "epoch": 1.4092905776411158, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.332017183303833, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7059988975524902, + "num_tokens": 332066044.0, + "step": 12833 + }, + { + "epoch": 1.4094003953437295, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.334965705871582, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7370975017547607, + "num_tokens": 332089319.0, + "step": 12834 + }, + { + "epoch": 1.409510213046343, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.7745554447174072, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7117817401885986, + "num_tokens": 332109374.0, + "step": 12835 + }, + { + "epoch": 1.4096200307489568, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.29490065574646, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7017567157745361, + "num_tokens": 332137601.0, + "step": 12836 + }, + { + "epoch": 1.4097298484515703, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4077353477478027, + "learning_rate": 1e-06, + "loss": 1.0043, + 
"mean_token_accuracy": 0.703811526298523, + "num_tokens": 332162374.0, + "step": 12837 + }, + { + "epoch": 1.409839666154184, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2235395908355713, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7024009227752686, + "num_tokens": 332191028.0, + "step": 12838 + }, + { + "epoch": 1.4099494838567976, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5812604427337646, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7149080634117126, + "num_tokens": 332212579.0, + "step": 12839 + }, + { + "epoch": 1.4100593015594114, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.318786382675171, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7156184911727905, + "num_tokens": 332240660.0, + "step": 12840 + }, + { + "epoch": 1.410169119262025, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.247187376022339, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.700141191482544, + "num_tokens": 332271878.0, + "step": 12841 + }, + { + "epoch": 1.4102789369646387, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2314679622650146, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7158609628677368, + "num_tokens": 332300385.0, + "step": 12842 + }, + { + "epoch": 1.4103887546672524, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4431991577148438, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7151857018470764, + "num_tokens": 332324927.0, + "step": 12843 + }, + { + "epoch": 1.410498572369866, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6084585189819336, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7130018472671509, + "num_tokens": 332347441.0, + "step": 12844 + }, + { + "epoch": 1.4106083900724797, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.690781831741333, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 
0.7123044729232788, + "num_tokens": 332367849.0, + "step": 12845 + }, + { + "epoch": 1.4107182077750933, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.46909236907959, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7016229629516602, + "num_tokens": 332394262.0, + "step": 12846 + }, + { + "epoch": 1.410828025477707, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5015029907226562, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6918076276779175, + "num_tokens": 332417902.0, + "step": 12847 + }, + { + "epoch": 1.4109378431803208, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1738736629486084, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6992843151092529, + "num_tokens": 332450973.0, + "step": 12848 + }, + { + "epoch": 1.4110476608829343, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4969913959503174, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7098334431648254, + "num_tokens": 332474891.0, + "step": 12849 + }, + { + "epoch": 1.4111574785855479, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3965260982513428, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7057794332504272, + "num_tokens": 332501341.0, + "step": 12850 + }, + { + "epoch": 1.4112672962881616, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.385099172592163, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7023211717605591, + "num_tokens": 332529082.0, + "step": 12851 + }, + { + "epoch": 1.4113771139907754, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4807825088500977, + "learning_rate": 1e-06, + "loss": 1.1101, + "mean_token_accuracy": 0.6808076500892639, + "num_tokens": 332556574.0, + "step": 12852 + }, + { + "epoch": 1.411486931693389, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4624979496002197, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7059040069580078, + 
"num_tokens": 332579569.0, + "step": 12853 + }, + { + "epoch": 1.4115967493960027, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2275896072387695, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6963356733322144, + "num_tokens": 332611878.0, + "step": 12854 + }, + { + "epoch": 1.4117065670986162, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.388798236846924, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6976152658462524, + "num_tokens": 332638058.0, + "step": 12855 + }, + { + "epoch": 1.41181638480123, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2893567085266113, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6899499297142029, + "num_tokens": 332666000.0, + "step": 12856 + }, + { + "epoch": 1.4119262025038437, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2889809608459473, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7168183326721191, + "num_tokens": 332694149.0, + "step": 12857 + }, + { + "epoch": 1.4120360202064572, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3805902004241943, + "learning_rate": 1e-06, + "loss": 1.0856, + "mean_token_accuracy": 0.680614173412323, + "num_tokens": 332724188.0, + "step": 12858 + }, + { + "epoch": 1.412145837909071, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.309769630432129, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6911945343017578, + "num_tokens": 332752306.0, + "step": 12859 + }, + { + "epoch": 1.4122556556116845, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.425055503845215, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.700808048248291, + "num_tokens": 332780115.0, + "step": 12860 + }, + { + "epoch": 1.4123654733142983, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.7402095794677734, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7069140672683716, + "num_tokens": 332800506.0, + 
"step": 12861 + }, + { + "epoch": 1.412475291016912, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.282626152038574, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7045258283615112, + "num_tokens": 332830867.0, + "step": 12862 + }, + { + "epoch": 1.4125851087195256, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3076205253601074, + "learning_rate": 1e-06, + "loss": 1.0898, + "mean_token_accuracy": 0.675910234451294, + "num_tokens": 332860462.0, + "step": 12863 + }, + { + "epoch": 1.4126949264221391, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.65509033203125, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7208470106124878, + "num_tokens": 332880760.0, + "step": 12864 + }, + { + "epoch": 1.4128047441247529, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.268594741821289, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6944342851638794, + "num_tokens": 332910473.0, + "step": 12865 + }, + { + "epoch": 1.4129145618273666, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.581155300140381, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6890766620635986, + "num_tokens": 332936802.0, + "step": 12866 + }, + { + "epoch": 1.4130243795299802, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.483865976333618, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7279482483863831, + "num_tokens": 332959432.0, + "step": 12867 + }, + { + "epoch": 1.413134197232594, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3034918308258057, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6901348829269409, + "num_tokens": 332987546.0, + "step": 12868 + }, + { + "epoch": 1.4132440149352075, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4431939125061035, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6904861330986023, + "num_tokens": 333014776.0, + "step": 12869 + }, + { + 
"epoch": 1.4133538326378212, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4308393001556396, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7059903740882874, + "num_tokens": 333040984.0, + "step": 12870 + }, + { + "epoch": 1.413463650340435, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.488117218017578, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7215607166290283, + "num_tokens": 333064260.0, + "step": 12871 + }, + { + "epoch": 1.4135734680430485, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.662923812866211, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7212547063827515, + "num_tokens": 333085230.0, + "step": 12872 + }, + { + "epoch": 1.4136832857456623, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.357861042022705, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6978837847709656, + "num_tokens": 333111202.0, + "step": 12873 + }, + { + "epoch": 1.4137931034482758, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.376586675643921, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7024788856506348, + "num_tokens": 333135562.0, + "step": 12874 + }, + { + "epoch": 1.4139029211508896, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3697803020477295, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7056454420089722, + "num_tokens": 333162233.0, + "step": 12875 + }, + { + "epoch": 1.4140127388535033, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3916587829589844, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7095747590065002, + "num_tokens": 333187389.0, + "step": 12876 + }, + { + "epoch": 1.4141225565561168, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.419546127319336, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7108913064002991, + "num_tokens": 333216193.0, + "step": 12877 + }, + { + "epoch": 1.4142323742587304, 
+ "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6700165271759033, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7197052240371704, + "num_tokens": 333237036.0, + "step": 12878 + }, + { + "epoch": 1.4143421919613441, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.378448009490967, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7182769179344177, + "num_tokens": 333261687.0, + "step": 12879 + }, + { + "epoch": 1.414452009663958, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6265342235565186, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.7062232494354248, + "num_tokens": 333285475.0, + "step": 12880 + }, + { + "epoch": 1.4145618273665714, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5334177017211914, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6980224251747131, + "num_tokens": 333308813.0, + "step": 12881 + }, + { + "epoch": 1.4146716450691852, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2996561527252197, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7077799439430237, + "num_tokens": 333335279.0, + "step": 12882 + }, + { + "epoch": 1.4147814627717987, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.392963171005249, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6991549730300903, + "num_tokens": 333364811.0, + "step": 12883 + }, + { + "epoch": 1.4148912804744125, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.780529022216797, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7174935936927795, + "num_tokens": 333385365.0, + "step": 12884 + }, + { + "epoch": 1.4150010981770262, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.80159592628479, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7214856743812561, + "num_tokens": 333405477.0, + "step": 12885 + }, + { + "epoch": 1.4151109158796398, + "ewc_loss": 
1.8596649169921875e-05, + "grad_norm": 2.4723150730133057, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7096522450447083, + "num_tokens": 333429113.0, + "step": 12886 + }, + { + "epoch": 1.4152207335822535, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.496394395828247, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7215511202812195, + "num_tokens": 333451628.0, + "step": 12887 + }, + { + "epoch": 1.415330551284867, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.7103781700134277, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7329710721969604, + "num_tokens": 333469094.0, + "step": 12888 + }, + { + "epoch": 1.4154403689874808, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5321478843688965, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7355977296829224, + "num_tokens": 333492280.0, + "step": 12889 + }, + { + "epoch": 1.4155501866900946, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.409677743911743, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6945246458053589, + "num_tokens": 333518127.0, + "step": 12890 + }, + { + "epoch": 1.4156600043927081, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.7651147842407227, + "learning_rate": 1e-06, + "loss": 1.1023, + "mean_token_accuracy": 0.6739565134048462, + "num_tokens": 333546384.0, + "step": 12891 + }, + { + "epoch": 1.4157698220953217, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5105090141296387, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7070568799972534, + "num_tokens": 333571273.0, + "step": 12892 + }, + { + "epoch": 1.4158796397979354, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 7.094906330108643, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7173594832420349, + "num_tokens": 333594588.0, + "step": 12893 + }, + { + "epoch": 1.4159894575005492, + "ewc_loss": 1.8596649169921875e-05, + 
"grad_norm": 2.1522481441497803, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7312606573104858, + "num_tokens": 333622666.0, + "step": 12894 + }, + { + "epoch": 1.4160992752031627, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4501988887786865, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.725292980670929, + "num_tokens": 333644976.0, + "step": 12895 + }, + { + "epoch": 1.4162090929057765, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.385676860809326, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.7051125764846802, + "num_tokens": 333673260.0, + "step": 12896 + }, + { + "epoch": 1.41631891060839, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3743762969970703, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6815088987350464, + "num_tokens": 333701124.0, + "step": 12897 + }, + { + "epoch": 1.4164287283110037, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.388413667678833, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7066983580589294, + "num_tokens": 333728031.0, + "step": 12898 + }, + { + "epoch": 1.4165385460136175, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3848633766174316, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7257628440856934, + "num_tokens": 333753218.0, + "step": 12899 + }, + { + "epoch": 1.416648363716231, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3845434188842773, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.7070288062095642, + "num_tokens": 333780133.0, + "step": 12900 + }, + { + "epoch": 1.4167581814188448, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.424440383911133, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6995996236801147, + "num_tokens": 333805585.0, + "step": 12901 + }, + { + "epoch": 1.4168679991214583, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 
8.515393257141113, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7004554867744446, + "num_tokens": 333832788.0, + "step": 12902 + }, + { + "epoch": 1.416977816824072, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2822537422180176, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7170149087905884, + "num_tokens": 333861074.0, + "step": 12903 + }, + { + "epoch": 1.4170876345266856, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.679534912109375, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7068860530853271, + "num_tokens": 333883421.0, + "step": 12904 + }, + { + "epoch": 1.4171974522292994, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6369469165802, + "learning_rate": 1e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.74765944480896, + "num_tokens": 333902876.0, + "step": 12905 + }, + { + "epoch": 1.417307269931913, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.28413462638855, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7060068845748901, + "num_tokens": 333932878.0, + "step": 12906 + }, + { + "epoch": 1.4174170876345267, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.176926374435425, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7063276171684265, + "num_tokens": 333964610.0, + "step": 12907 + }, + { + "epoch": 1.4175269053371404, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4654436111450195, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6966729760169983, + "num_tokens": 333990179.0, + "step": 12908 + }, + { + "epoch": 1.417636723039754, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.506117820739746, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7195468544960022, + "num_tokens": 334014098.0, + "step": 12909 + }, + { + "epoch": 1.4177465407423677, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6033072471618652, + 
"learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7143087387084961, + "num_tokens": 334035745.0, + "step": 12910 + }, + { + "epoch": 1.4178563584449813, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.462843418121338, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7099951505661011, + "num_tokens": 334063259.0, + "step": 12911 + }, + { + "epoch": 1.417966176147595, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1583759784698486, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7089611291885376, + "num_tokens": 334092798.0, + "step": 12912 + }, + { + "epoch": 1.4180759938502088, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.255605697631836, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7180953025817871, + "num_tokens": 334121983.0, + "step": 12913 + }, + { + "epoch": 1.4181858115528223, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3068532943725586, + "learning_rate": 1e-06, + "loss": 1.0749, + "mean_token_accuracy": 0.6893866062164307, + "num_tokens": 334150366.0, + "step": 12914 + }, + { + "epoch": 1.4182956292554358, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.489800453186035, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7187460660934448, + "num_tokens": 334173217.0, + "step": 12915 + }, + { + "epoch": 1.4184054469580496, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.201145887374878, + "learning_rate": 1e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6852356195449829, + "num_tokens": 334203261.0, + "step": 12916 + }, + { + "epoch": 1.4185152646606634, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.456789016723633, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6910624504089355, + "num_tokens": 334229166.0, + "step": 12917 + }, + { + "epoch": 1.418625082363277, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.250465154647827, + "learning_rate": 1e-06, + 
"loss": 1.056, + "mean_token_accuracy": 0.6854956746101379, + "num_tokens": 334258769.0, + "step": 12918 + }, + { + "epoch": 1.4187349000658906, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.211264133453369, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.7043758034706116, + "num_tokens": 334287469.0, + "step": 12919 + }, + { + "epoch": 1.4188447177685042, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3253321647644043, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7073291540145874, + "num_tokens": 334314416.0, + "step": 12920 + }, + { + "epoch": 1.418954535471118, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2825944423675537, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7208232879638672, + "num_tokens": 334339914.0, + "step": 12921 + }, + { + "epoch": 1.4190643531737317, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5462019443511963, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.711640477180481, + "num_tokens": 334362130.0, + "step": 12922 + }, + { + "epoch": 1.4191741708763452, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.0689775943756104, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6970474123954773, + "num_tokens": 334394745.0, + "step": 12923 + }, + { + "epoch": 1.419283988578959, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2776591777801514, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7150572538375854, + "num_tokens": 334422706.0, + "step": 12924 + }, + { + "epoch": 1.4193938062815725, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.339958667755127, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7002103924751282, + "num_tokens": 334448475.0, + "step": 12925 + }, + { + "epoch": 1.4195036239841863, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.29986572265625, + "learning_rate": 1e-06, + "loss": 0.9348, + 
"mean_token_accuracy": 0.7195266485214233, + "num_tokens": 334473179.0, + "step": 12926 + }, + { + "epoch": 1.4196134416868, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.390770196914673, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7100348472595215, + "num_tokens": 334499427.0, + "step": 12927 + }, + { + "epoch": 1.4197232593894136, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3225505352020264, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6950223445892334, + "num_tokens": 334531301.0, + "step": 12928 + }, + { + "epoch": 1.419833077092027, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5417191982269287, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7079699039459229, + "num_tokens": 334555507.0, + "step": 12929 + }, + { + "epoch": 1.4199428947946409, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.132303476333618, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7046875953674316, + "num_tokens": 334587652.0, + "step": 12930 + }, + { + "epoch": 1.4200527124972546, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6176600456237793, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.710846483707428, + "num_tokens": 334610022.0, + "step": 12931 + }, + { + "epoch": 1.4201625301998682, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.263195037841797, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6935858726501465, + "num_tokens": 334639176.0, + "step": 12932 + }, + { + "epoch": 1.420272347902482, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6907269954681396, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7268904447555542, + "num_tokens": 334658933.0, + "step": 12933 + }, + { + "epoch": 1.4203821656050954, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5036733150482178, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 
0.6928427815437317, + "num_tokens": 334687502.0, + "step": 12934 + }, + { + "epoch": 1.4204919833077092, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.083453893661499, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6983112096786499, + "num_tokens": 334719409.0, + "step": 12935 + }, + { + "epoch": 1.420601801010323, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 6.959977149963379, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6960477828979492, + "num_tokens": 334749363.0, + "step": 12936 + }, + { + "epoch": 1.4207116187129365, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.187337875366211, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6847613453865051, + "num_tokens": 334780552.0, + "step": 12937 + }, + { + "epoch": 1.4208214364155503, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3936591148376465, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6933912634849548, + "num_tokens": 334808206.0, + "step": 12938 + }, + { + "epoch": 1.4209312541181638, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.506073474884033, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.708834707736969, + "num_tokens": 334831561.0, + "step": 12939 + }, + { + "epoch": 1.4210410718207775, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.357755661010742, + "learning_rate": 1e-06, + "loss": 1.1181, + "mean_token_accuracy": 0.6778808832168579, + "num_tokens": 334859188.0, + "step": 12940 + }, + { + "epoch": 1.4211508895233913, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4548747539520264, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7167747020721436, + "num_tokens": 334884809.0, + "step": 12941 + }, + { + "epoch": 1.4212607072260048, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3334574699401855, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.7043864727020264, + 
"num_tokens": 334910764.0, + "step": 12942 + }, + { + "epoch": 1.4213705249286184, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6025421619415283, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.7114964127540588, + "num_tokens": 334935069.0, + "step": 12943 + }, + { + "epoch": 1.4214803426312321, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3860814571380615, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7053167819976807, + "num_tokens": 334961762.0, + "step": 12944 + }, + { + "epoch": 1.4215901603338459, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4863290786743164, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7143259644508362, + "num_tokens": 334988095.0, + "step": 12945 + }, + { + "epoch": 1.4216999780364594, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.314762830734253, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7221847772598267, + "num_tokens": 335015639.0, + "step": 12946 + }, + { + "epoch": 1.4218097957390732, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.621797561645508, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7188886404037476, + "num_tokens": 335038331.0, + "step": 12947 + }, + { + "epoch": 1.4219196134416867, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.390800952911377, + "learning_rate": 1e-06, + "loss": 1.0919, + "mean_token_accuracy": 0.6886270642280579, + "num_tokens": 335064612.0, + "step": 12948 + }, + { + "epoch": 1.4220294311443005, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4372081756591797, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7153177261352539, + "num_tokens": 335088936.0, + "step": 12949 + }, + { + "epoch": 1.4221392488469142, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4532737731933594, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7298359274864197, + "num_tokens": 335111531.0, 
+ "step": 12950 + }, + { + "epoch": 1.4222490665495278, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3989691734313965, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7061523795127869, + "num_tokens": 335138758.0, + "step": 12951 + }, + { + "epoch": 1.4223588842521415, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.379460573196411, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7146117687225342, + "num_tokens": 335165080.0, + "step": 12952 + }, + { + "epoch": 1.422468701954755, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5281591415405273, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6976041793823242, + "num_tokens": 335189641.0, + "step": 12953 + }, + { + "epoch": 1.4225785196573688, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.205129384994507, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6986638903617859, + "num_tokens": 335219361.0, + "step": 12954 + }, + { + "epoch": 1.4226883373599823, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.519428253173828, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7099640369415283, + "num_tokens": 335242980.0, + "step": 12955 + }, + { + "epoch": 1.422798155062596, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.6353843212127686, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6947276592254639, + "num_tokens": 335266402.0, + "step": 12956 + }, + { + "epoch": 1.4229079727652096, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.579577684402466, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7222180366516113, + "num_tokens": 335287588.0, + "step": 12957 + }, + { + "epoch": 1.4230177904678234, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4689395427703857, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7183698415756226, + "num_tokens": 335312230.0, + "step": 12958 + }, + { + 
"epoch": 1.4231276081704372, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1969430446624756, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7401659488677979, + "num_tokens": 335339674.0, + "step": 12959 + }, + { + "epoch": 1.4232374258730507, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.535526752471924, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7059052586555481, + "num_tokens": 335366323.0, + "step": 12960 + }, + { + "epoch": 1.4233472435756644, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.557450532913208, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.727196455001831, + "num_tokens": 335388086.0, + "step": 12961 + }, + { + "epoch": 1.423457061278278, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.248462200164795, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7229004502296448, + "num_tokens": 335416717.0, + "step": 12962 + }, + { + "epoch": 1.4235668789808917, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3206870555877686, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.7037727236747742, + "num_tokens": 335448787.0, + "step": 12963 + }, + { + "epoch": 1.4236766966835055, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4082045555114746, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7522224187850952, + "num_tokens": 335471561.0, + "step": 12964 + }, + { + "epoch": 1.423786514386119, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3261215686798096, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7110092639923096, + "num_tokens": 335498861.0, + "step": 12965 + }, + { + "epoch": 1.4238963320887328, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.802968978881836, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7206536531448364, + "num_tokens": 335518377.0, + "step": 12966 + }, + { + "epoch": 1.4240061497913463, + 
"ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5617387294769287, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7240789532661438, + "num_tokens": 335539438.0, + "step": 12967 + }, + { + "epoch": 1.42411596749396, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3073503971099854, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7509604692459106, + "num_tokens": 335563992.0, + "step": 12968 + }, + { + "epoch": 1.4242257851965736, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.280613899230957, + "learning_rate": 1e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.6884477138519287, + "num_tokens": 335595342.0, + "step": 12969 + }, + { + "epoch": 1.4243356028991874, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3397932052612305, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6910531520843506, + "num_tokens": 335623167.0, + "step": 12970 + }, + { + "epoch": 1.424445420601801, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5880837440490723, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7272731065750122, + "num_tokens": 335645301.0, + "step": 12971 + }, + { + "epoch": 1.4245552383044147, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.8867921829223633, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7226241827011108, + "num_tokens": 335668265.0, + "step": 12972 + }, + { + "epoch": 1.4246650560070284, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.151212692260742, + "learning_rate": 1e-06, + "loss": 1.1146, + "mean_token_accuracy": 0.675492525100708, + "num_tokens": 335702380.0, + "step": 12973 + }, + { + "epoch": 1.424774873709642, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4031918048858643, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7019921541213989, + "num_tokens": 335728746.0, + "step": 12974 + }, + { + "epoch": 1.4248846914122557, + "ewc_loss": 
1.8596649169921875e-05, + "grad_norm": 2.2165791988372803, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.7031427621841431, + "num_tokens": 335760911.0, + "step": 12975 + }, + { + "epoch": 1.4249945091148692, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.339794635772705, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7234295606613159, + "num_tokens": 335787964.0, + "step": 12976 + }, + { + "epoch": 1.425104326817483, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.668083906173706, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.712209165096283, + "num_tokens": 335808830.0, + "step": 12977 + }, + { + "epoch": 1.4252141445200968, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.216867446899414, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.698948860168457, + "num_tokens": 335838235.0, + "step": 12978 + }, + { + "epoch": 1.4253239622227103, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4858341217041016, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7101988196372986, + "num_tokens": 335861638.0, + "step": 12979 + }, + { + "epoch": 1.4254337799253238, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.404280424118042, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7034324407577515, + "num_tokens": 335887560.0, + "step": 12980 + }, + { + "epoch": 1.4255435976279376, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.173034191131592, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7027373909950256, + "num_tokens": 335916147.0, + "step": 12981 + }, + { + "epoch": 1.4256534153305513, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.29842209815979, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6873378753662109, + "num_tokens": 335945961.0, + "step": 12982 + }, + { + "epoch": 1.4257632330331649, + "ewc_loss": 1.8596649169921875e-05, + 
"grad_norm": 2.4691035747528076, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7183780670166016, + "num_tokens": 335968366.0, + "step": 12983 + }, + { + "epoch": 1.4258730507357786, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5263748168945312, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7286278605461121, + "num_tokens": 335989642.0, + "step": 12984 + }, + { + "epoch": 1.4259828684383922, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3185997009277344, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7044585943222046, + "num_tokens": 336017457.0, + "step": 12985 + }, + { + "epoch": 1.426092686141006, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3813254833221436, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7073472738265991, + "num_tokens": 336043602.0, + "step": 12986 + }, + { + "epoch": 1.4262025038436197, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3850927352905273, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7182042598724365, + "num_tokens": 336070210.0, + "step": 12987 + }, + { + "epoch": 1.4263123215462332, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.344780206680298, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7137527465820312, + "num_tokens": 336097055.0, + "step": 12988 + }, + { + "epoch": 1.426422139248847, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3193466663360596, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6989352703094482, + "num_tokens": 336124255.0, + "step": 12989 + }, + { + "epoch": 1.4265319569514605, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3574986457824707, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7075437307357788, + "num_tokens": 336151343.0, + "step": 12990 + }, + { + "epoch": 1.4266417746540743, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 
2.153517246246338, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7074743509292603, + "num_tokens": 336183647.0, + "step": 12991 + }, + { + "epoch": 1.426751592356688, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.609123945236206, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7168091535568237, + "num_tokens": 336204677.0, + "step": 12992 + }, + { + "epoch": 1.4268614100593016, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2744739055633545, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7011808753013611, + "num_tokens": 336232008.0, + "step": 12993 + }, + { + "epoch": 1.426971227761915, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.134359359741211, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6907708644866943, + "num_tokens": 336265650.0, + "step": 12994 + }, + { + "epoch": 1.4270810454645289, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2820546627044678, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6901360154151917, + "num_tokens": 336293078.0, + "step": 12995 + }, + { + "epoch": 1.4271908631671426, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1612179279327393, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6925694346427917, + "num_tokens": 336325403.0, + "step": 12996 + }, + { + "epoch": 1.4273006808697561, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.40381121635437, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7060720920562744, + "num_tokens": 336350773.0, + "step": 12997 + }, + { + "epoch": 1.42741049857237, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4045379161834717, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7143357992172241, + "num_tokens": 336375432.0, + "step": 12998 + }, + { + "epoch": 1.4275203162749834, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2916972637176514, + 
"learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7145113348960876, + "num_tokens": 336402467.0, + "step": 12999 + }, + { + "epoch": 1.4276301339775972, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4887330532073975, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7136578559875488, + "num_tokens": 336426745.0, + "step": 13000 + }, + { + "epoch": 1.427739951680211, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1220602989196777, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7100393176078796, + "num_tokens": 336456691.0, + "step": 13001 + }, + { + "epoch": 1.4278497693828245, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2984349727630615, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7286390662193298, + "num_tokens": 336481532.0, + "step": 13002 + }, + { + "epoch": 1.4279595870854382, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5510237216949463, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7398090362548828, + "num_tokens": 336502282.0, + "step": 13003 + }, + { + "epoch": 1.4280694047880518, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.183281183242798, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6959197521209717, + "num_tokens": 336536836.0, + "step": 13004 + }, + { + "epoch": 1.4281792224906655, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.505554437637329, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7236928343772888, + "num_tokens": 336559622.0, + "step": 13005 + }, + { + "epoch": 1.4282890401932793, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.274226188659668, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7217403650283813, + "num_tokens": 336586090.0, + "step": 13006 + }, + { + "epoch": 1.4283988578958928, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3885505199432373, + "learning_rate": 1e-06, + 
"loss": 0.9758, + "mean_token_accuracy": 0.7170759439468384, + "num_tokens": 336610788.0, + "step": 13007 + }, + { + "epoch": 1.4285086755985064, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.2624268531799316, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7150344848632812, + "num_tokens": 336637114.0, + "step": 13008 + }, + { + "epoch": 1.4286184933011201, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4466171264648438, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7097982168197632, + "num_tokens": 336662254.0, + "step": 13009 + }, + { + "epoch": 1.4287283110037339, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1678221225738525, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6947351694107056, + "num_tokens": 336692324.0, + "step": 13010 + }, + { + "epoch": 1.4288381287063474, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.447464942932129, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6977704763412476, + "num_tokens": 336719362.0, + "step": 13011 + }, + { + "epoch": 1.4289479464089612, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.7936034202575684, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7119709849357605, + "num_tokens": 336739122.0, + "step": 13012 + }, + { + "epoch": 1.4290577641115747, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.1358389854431152, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6891943216323853, + "num_tokens": 336772121.0, + "step": 13013 + }, + { + "epoch": 1.4291675818141885, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.5000648498535156, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6977806091308594, + "num_tokens": 336796925.0, + "step": 13014 + }, + { + "epoch": 1.4292773995168022, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4932682514190674, + "learning_rate": 1e-06, + "loss": 1.0097, + 
"mean_token_accuracy": 0.7049894332885742, + "num_tokens": 336821568.0, + "step": 13015 + }, + { + "epoch": 1.4293872172194158, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4622855186462402, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7063908576965332, + "num_tokens": 336847377.0, + "step": 13016 + }, + { + "epoch": 1.4294970349220295, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.361882448196411, + "learning_rate": 1e-06, + "loss": 1.0792, + "mean_token_accuracy": 0.6830979585647583, + "num_tokens": 336872968.0, + "step": 13017 + }, + { + "epoch": 1.429606852624643, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.507458209991455, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6966079473495483, + "num_tokens": 336897484.0, + "step": 13018 + }, + { + "epoch": 1.4297166703272568, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.133357524871826, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7295520305633545, + "num_tokens": 336925818.0, + "step": 13019 + }, + { + "epoch": 1.4298264880298703, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.477785110473633, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.722906768321991, + "num_tokens": 336949216.0, + "step": 13020 + }, + { + "epoch": 1.429936305732484, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.702385663986206, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6981620192527771, + "num_tokens": 336970982.0, + "step": 13021 + }, + { + "epoch": 1.4300461234350976, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.084338665008545, + "learning_rate": 1e-06, + "loss": 1.0933, + "mean_token_accuracy": 0.6950668096542358, + "num_tokens": 337003995.0, + "step": 13022 + }, + { + "epoch": 1.4301559411377114, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.9170660972595215, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 
0.7353350520133972, + "num_tokens": 337022070.0, + "step": 13023 + }, + { + "epoch": 1.4302657588403251, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4436450004577637, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7155804634094238, + "num_tokens": 337044754.0, + "step": 13024 + }, + { + "epoch": 1.4303755765429387, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.398369789123535, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6947454214096069, + "num_tokens": 337071397.0, + "step": 13025 + }, + { + "epoch": 1.4304853942455524, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.367365837097168, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.692581295967102, + "num_tokens": 337097440.0, + "step": 13026 + }, + { + "epoch": 1.430595211948166, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.603566884994507, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7101861238479614, + "num_tokens": 337122037.0, + "step": 13027 + }, + { + "epoch": 1.4307050296507797, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3788039684295654, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7082090973854065, + "num_tokens": 337147695.0, + "step": 13028 + }, + { + "epoch": 1.4308148473533935, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.469921350479126, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7177494168281555, + "num_tokens": 337174018.0, + "step": 13029 + }, + { + "epoch": 1.430924665056007, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.3101847171783447, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7044790983200073, + "num_tokens": 337201036.0, + "step": 13030 + }, + { + "epoch": 1.4310344827586206, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.287851095199585, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7161878347396851, + 
"num_tokens": 337227358.0, + "step": 13031 + }, + { + "epoch": 1.4311443004612343, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 2.4223692417144775, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7076371908187866, + "num_tokens": 337251693.0, + "step": 13032 + }, + { + "epoch": 1.431254118163848, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.365020990371704, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7157781720161438, + "num_tokens": 337278719.0, + "step": 13033 + }, + { + "epoch": 1.4313639358664616, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.242776393890381, + "learning_rate": 1e-06, + "loss": 1.0796, + "mean_token_accuracy": 0.6854090094566345, + "num_tokens": 337310203.0, + "step": 13034 + }, + { + "epoch": 1.4314737535690754, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.544877529144287, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7101724147796631, + "num_tokens": 337332944.0, + "step": 13035 + }, + { + "epoch": 1.431583571271689, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.604755163192749, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7102057337760925, + "num_tokens": 337353080.0, + "step": 13036 + }, + { + "epoch": 1.4316933889743026, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0392589569091797, + "learning_rate": 1e-06, + "loss": 1.1027, + "mean_token_accuracy": 0.6776641607284546, + "num_tokens": 337388005.0, + "step": 13037 + }, + { + "epoch": 1.4318032066769164, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.478999614715576, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7085149884223938, + "num_tokens": 337412486.0, + "step": 13038 + }, + { + "epoch": 1.43191302437953, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.237288475036621, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.695932924747467, + "num_tokens": 337442562.0, + 
"step": 13039 + }, + { + "epoch": 1.4320228420821437, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.774822473526001, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7290318012237549, + "num_tokens": 337462022.0, + "step": 13040 + }, + { + "epoch": 1.4321326597847572, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.280189037322998, + "learning_rate": 1e-06, + "loss": 1.1013, + "mean_token_accuracy": 0.6764333844184875, + "num_tokens": 337489573.0, + "step": 13041 + }, + { + "epoch": 1.432242477487371, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3204078674316406, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6980570554733276, + "num_tokens": 337517170.0, + "step": 13042 + }, + { + "epoch": 1.4323522951899847, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1032555103302, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.704157829284668, + "num_tokens": 337550404.0, + "step": 13043 + }, + { + "epoch": 1.4324621128925983, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.381359577178955, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7155910134315491, + "num_tokens": 337573747.0, + "step": 13044 + }, + { + "epoch": 1.4325719305952118, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.599895715713501, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7203961610794067, + "num_tokens": 337596783.0, + "step": 13045 + }, + { + "epoch": 1.4326817482978256, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4995319843292236, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7145941257476807, + "num_tokens": 337619950.0, + "step": 13046 + }, + { + "epoch": 1.4327915660004393, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.086033582687378, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7279808521270752, + "num_tokens": 337650293.0, + "step": 13047 + }, + { + "epoch": 
1.4329013837030529, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3092784881591797, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7098095417022705, + "num_tokens": 337680476.0, + "step": 13048 + }, + { + "epoch": 1.4330112014056666, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.548771858215332, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7105780839920044, + "num_tokens": 337702464.0, + "step": 13049 + }, + { + "epoch": 1.4331210191082802, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2791624069213867, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7088606357574463, + "num_tokens": 337729715.0, + "step": 13050 + }, + { + "epoch": 1.433230836810894, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.285630941390991, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6907968521118164, + "num_tokens": 337757375.0, + "step": 13051 + }, + { + "epoch": 1.4333406545135077, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.350857973098755, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.719002366065979, + "num_tokens": 337784525.0, + "step": 13052 + }, + { + "epoch": 1.4334504722161212, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.71281099319458, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7198099493980408, + "num_tokens": 337807150.0, + "step": 13053 + }, + { + "epoch": 1.433560289918735, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.512291669845581, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.7008364200592041, + "num_tokens": 337830400.0, + "step": 13054 + }, + { + "epoch": 1.4336701076213485, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2921035289764404, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7199125289916992, + "num_tokens": 337854411.0, + "step": 13055 + }, + { + "epoch": 1.4337799253239623, + "ewc_loss": 
1.8715858459472656e-05, + "grad_norm": 2.390026092529297, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.720899760723114, + "num_tokens": 337880853.0, + "step": 13056 + }, + { + "epoch": 1.433889743026576, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.458204984664917, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7122170925140381, + "num_tokens": 337906239.0, + "step": 13057 + }, + { + "epoch": 1.4339995607291895, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5310826301574707, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7375746965408325, + "num_tokens": 337928118.0, + "step": 13058 + }, + { + "epoch": 1.434109378431803, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3527255058288574, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.699704647064209, + "num_tokens": 337957483.0, + "step": 13059 + }, + { + "epoch": 1.4342191961344168, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.328984260559082, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7162956595420837, + "num_tokens": 337986256.0, + "step": 13060 + }, + { + "epoch": 1.4343290138370306, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.194701910018921, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7222933769226074, + "num_tokens": 338016447.0, + "step": 13061 + }, + { + "epoch": 1.4344388315396441, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6014366149902344, + "learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6864820718765259, + "num_tokens": 338041049.0, + "step": 13062 + }, + { + "epoch": 1.434548649242258, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.237356185913086, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7315624952316284, + "num_tokens": 338069754.0, + "step": 13063 + }, + { + "epoch": 1.4346584669448714, + "ewc_loss": 1.8715858459472656e-05, + 
"grad_norm": 2.592197895050049, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7213908433914185, + "num_tokens": 338090336.0, + "step": 13064 + }, + { + "epoch": 1.4347682846474852, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.260157346725464, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7231391668319702, + "num_tokens": 338118398.0, + "step": 13065 + }, + { + "epoch": 1.434878102350099, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.291400909423828, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7234373092651367, + "num_tokens": 338144635.0, + "step": 13066 + }, + { + "epoch": 1.4349879200527125, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.316525936126709, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6976990103721619, + "num_tokens": 338172175.0, + "step": 13067 + }, + { + "epoch": 1.4350977377553262, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.210376262664795, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7059375047683716, + "num_tokens": 338200321.0, + "step": 13068 + }, + { + "epoch": 1.4352075554579398, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.41302227973938, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7159790992736816, + "num_tokens": 338224147.0, + "step": 13069 + }, + { + "epoch": 1.4353173731605535, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.130953788757324, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7176964282989502, + "num_tokens": 338253884.0, + "step": 13070 + }, + { + "epoch": 1.4354271908631673, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.199298858642578, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7054396867752075, + "num_tokens": 338281900.0, + "step": 13071 + }, + { + "epoch": 1.4355370085657808, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5149953365325928, 
+ "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7125561237335205, + "num_tokens": 338305899.0, + "step": 13072 + }, + { + "epoch": 1.4356468262683943, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.9121934175491333, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6996378302574158, + "num_tokens": 338344227.0, + "step": 13073 + }, + { + "epoch": 1.435756643971008, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.640947103500366, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7040548324584961, + "num_tokens": 338369871.0, + "step": 13074 + }, + { + "epoch": 1.4358664616736219, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4725685119628906, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7060295343399048, + "num_tokens": 338396378.0, + "step": 13075 + }, + { + "epoch": 1.4359762793762354, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4846181869506836, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7095073461532593, + "num_tokens": 338418917.0, + "step": 13076 + }, + { + "epoch": 1.4360860970788492, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5109925270080566, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7134212255477905, + "num_tokens": 338443366.0, + "step": 13077 + }, + { + "epoch": 1.4361959147814627, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5881612300872803, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7365501523017883, + "num_tokens": 338464714.0, + "step": 13078 + }, + { + "epoch": 1.4363057324840764, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2925665378570557, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6975133419036865, + "num_tokens": 338496171.0, + "step": 13079 + }, + { + "epoch": 1.4364155501866902, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3685052394866943, + "learning_rate": 1e-06, 
+ "loss": 0.8937, + "mean_token_accuracy": 0.7305667400360107, + "num_tokens": 338521962.0, + "step": 13080 + }, + { + "epoch": 1.4365253678893037, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.414254665374756, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7106248736381531, + "num_tokens": 338547278.0, + "step": 13081 + }, + { + "epoch": 1.4366351855919175, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4293293952941895, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7110412120819092, + "num_tokens": 338574124.0, + "step": 13082 + }, + { + "epoch": 1.436745003294531, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.569460868835449, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7195612192153931, + "num_tokens": 338598037.0, + "step": 13083 + }, + { + "epoch": 1.4368548209971448, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.151501178741455, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6953864097595215, + "num_tokens": 338629611.0, + "step": 13084 + }, + { + "epoch": 1.4369646386997583, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3993489742279053, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7487481832504272, + "num_tokens": 338652735.0, + "step": 13085 + }, + { + "epoch": 1.437074456402372, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6931588649749756, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7179206609725952, + "num_tokens": 338673571.0, + "step": 13086 + }, + { + "epoch": 1.4371842741049856, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.312775135040283, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6983405351638794, + "num_tokens": 338700594.0, + "step": 13087 + }, + { + "epoch": 1.4372940918075994, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7848851680755615, + "learning_rate": 1e-06, + "loss": 0.9457, + 
"mean_token_accuracy": 0.7195019721984863, + "num_tokens": 338721880.0, + "step": 13088 + }, + { + "epoch": 1.4374039095102131, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.533287525177002, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7249966263771057, + "num_tokens": 338745508.0, + "step": 13089 + }, + { + "epoch": 1.4375137272128267, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6020607948303223, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7104374766349792, + "num_tokens": 338769394.0, + "step": 13090 + }, + { + "epoch": 1.4376235449154404, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2492923736572266, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7052114009857178, + "num_tokens": 338799700.0, + "step": 13091 + }, + { + "epoch": 1.437733362618054, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6203155517578125, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7090336084365845, + "num_tokens": 338823255.0, + "step": 13092 + }, + { + "epoch": 1.4378431803206677, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1461150646209717, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7020460367202759, + "num_tokens": 338855495.0, + "step": 13093 + }, + { + "epoch": 1.4379529980232815, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6561925411224365, + "learning_rate": 1e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7595829367637634, + "num_tokens": 338873672.0, + "step": 13094 + }, + { + "epoch": 1.438062815725895, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.372695207595825, + "learning_rate": 1e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6891048550605774, + "num_tokens": 338900914.0, + "step": 13095 + }, + { + "epoch": 1.4381726334285085, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.780547618865967, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 
0.7053408622741699, + "num_tokens": 338926093.0, + "step": 13096 + }, + { + "epoch": 1.4382824511311223, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.035522699356079, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6982296705245972, + "num_tokens": 338959557.0, + "step": 13097 + }, + { + "epoch": 1.438392268833736, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1895484924316406, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6857079863548279, + "num_tokens": 338995283.0, + "step": 13098 + }, + { + "epoch": 1.4385020865363496, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5998311042785645, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6938534379005432, + "num_tokens": 339017549.0, + "step": 13099 + }, + { + "epoch": 1.4386119042389633, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3353073596954346, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7055279016494751, + "num_tokens": 339045339.0, + "step": 13100 + }, + { + "epoch": 1.4387217219415769, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.246567964553833, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7028458714485168, + "num_tokens": 339076105.0, + "step": 13101 + }, + { + "epoch": 1.4388315396441906, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.413102388381958, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6926162242889404, + "num_tokens": 339101917.0, + "step": 13102 + }, + { + "epoch": 1.4389413573468044, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.510127067565918, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7161623239517212, + "num_tokens": 339125277.0, + "step": 13103 + }, + { + "epoch": 1.439051175049418, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7834954261779785, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7297236919403076, + 
"num_tokens": 339142988.0, + "step": 13104 + }, + { + "epoch": 1.4391609927520317, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6289992332458496, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7026647925376892, + "num_tokens": 339164220.0, + "step": 13105 + }, + { + "epoch": 1.4392708104546452, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1874289512634277, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6993387341499329, + "num_tokens": 339194424.0, + "step": 13106 + }, + { + "epoch": 1.439380628157259, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4099273681640625, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6951043605804443, + "num_tokens": 339219591.0, + "step": 13107 + }, + { + "epoch": 1.4394904458598727, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4267637729644775, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6996538639068604, + "num_tokens": 339250645.0, + "step": 13108 + }, + { + "epoch": 1.4396002635624863, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3432559967041016, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7004883289337158, + "num_tokens": 339276743.0, + "step": 13109 + }, + { + "epoch": 1.4397100812650998, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5872862339019775, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7222140431404114, + "num_tokens": 339297691.0, + "step": 13110 + }, + { + "epoch": 1.4398198989677136, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.503333330154419, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7137428522109985, + "num_tokens": 339320684.0, + "step": 13111 + }, + { + "epoch": 1.4399297166703273, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4118032455444336, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7121236324310303, + "num_tokens": 
339345478.0, + "step": 13112 + }, + { + "epoch": 1.4400395343729409, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.509546995162964, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7139964699745178, + "num_tokens": 339368782.0, + "step": 13113 + }, + { + "epoch": 1.4401493520755546, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.388786554336548, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6949703097343445, + "num_tokens": 339396266.0, + "step": 13114 + }, + { + "epoch": 1.4402591697781681, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.228567600250244, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7179422974586487, + "num_tokens": 339425471.0, + "step": 13115 + }, + { + "epoch": 1.440368987480782, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3834519386291504, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7316924333572388, + "num_tokens": 339448977.0, + "step": 13116 + }, + { + "epoch": 1.4404788051833957, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5296409130096436, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7190635204315186, + "num_tokens": 339471164.0, + "step": 13117 + }, + { + "epoch": 1.4405886228860092, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3097305297851562, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7133845090866089, + "num_tokens": 339497571.0, + "step": 13118 + }, + { + "epoch": 1.440698440588623, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.434610605239868, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7172908782958984, + "num_tokens": 339522326.0, + "step": 13119 + }, + { + "epoch": 1.4408082582912365, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4130022525787354, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7032540440559387, + "num_tokens": 339547243.0, + "step": 13120 + 
}, + { + "epoch": 1.4409180759938502, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4729104042053223, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7089244723320007, + "num_tokens": 339570558.0, + "step": 13121 + }, + { + "epoch": 1.441027893696464, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.495823621749878, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7308046817779541, + "num_tokens": 339592445.0, + "step": 13122 + }, + { + "epoch": 1.4411377113990775, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4687678813934326, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7011323571205139, + "num_tokens": 339617239.0, + "step": 13123 + }, + { + "epoch": 1.441247529101691, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.615656614303589, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7084802389144897, + "num_tokens": 339639633.0, + "step": 13124 + }, + { + "epoch": 1.4413573468043048, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4381589889526367, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7154446244239807, + "num_tokens": 339663745.0, + "step": 13125 + }, + { + "epoch": 1.4414671645069186, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7342567443847656, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7177894115447998, + "num_tokens": 339683723.0, + "step": 13126 + }, + { + "epoch": 1.4415769822095321, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.946643352508545, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7339708209037781, + "num_tokens": 339702235.0, + "step": 13127 + }, + { + "epoch": 1.4416867999121459, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.363900899887085, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7333272695541382, + "num_tokens": 339727218.0, + "step": 13128 + }, + { + "epoch": 
1.4417966176147594, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2650134563446045, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7167255878448486, + "num_tokens": 339755696.0, + "step": 13129 + }, + { + "epoch": 1.4419064353173732, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2786262035369873, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7063171863555908, + "num_tokens": 339781899.0, + "step": 13130 + }, + { + "epoch": 1.442016253019987, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.502199411392212, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7318045496940613, + "num_tokens": 339805352.0, + "step": 13131 + }, + { + "epoch": 1.4421260707226005, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.440220832824707, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7285492420196533, + "num_tokens": 339829741.0, + "step": 13132 + }, + { + "epoch": 1.4422358884252142, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5307040214538574, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7196546196937561, + "num_tokens": 339852460.0, + "step": 13133 + }, + { + "epoch": 1.4423457061278278, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.428889274597168, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6978979110717773, + "num_tokens": 339877706.0, + "step": 13134 + }, + { + "epoch": 1.4424555238304415, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5973258018493652, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7245388031005859, + "num_tokens": 339901952.0, + "step": 13135 + }, + { + "epoch": 1.442565341533055, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.723961114883423, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7149478197097778, + "num_tokens": 339924641.0, + "step": 13136 + }, + { + "epoch": 1.4426751592356688, + 
"ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1417667865753174, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.713596761226654, + "num_tokens": 339955650.0, + "step": 13137 + }, + { + "epoch": 1.4427849769382823, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4201979637145996, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7054088711738586, + "num_tokens": 339982083.0, + "step": 13138 + }, + { + "epoch": 1.442894794640896, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5545647144317627, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6860145330429077, + "num_tokens": 340003890.0, + "step": 13139 + }, + { + "epoch": 1.4430046123435099, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4031996726989746, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7064875364303589, + "num_tokens": 340028979.0, + "step": 13140 + }, + { + "epoch": 1.4431144300461234, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5242910385131836, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.711833119392395, + "num_tokens": 340051821.0, + "step": 13141 + }, + { + "epoch": 1.4432242477487371, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3671627044677734, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.721184253692627, + "num_tokens": 340077118.0, + "step": 13142 + }, + { + "epoch": 1.4433340654513507, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3550519943237305, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.698809027671814, + "num_tokens": 340103599.0, + "step": 13143 + }, + { + "epoch": 1.4434438831539644, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.506578207015991, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7296021580696106, + "num_tokens": 340125652.0, + "step": 13144 + }, + { + "epoch": 1.4435537008565782, + "ewc_loss": 
1.8715858459472656e-05, + "grad_norm": 2.418555974960327, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7197881937026978, + "num_tokens": 340150073.0, + "step": 13145 + }, + { + "epoch": 1.4436635185591917, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5423293113708496, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.725857675075531, + "num_tokens": 340172018.0, + "step": 13146 + }, + { + "epoch": 1.4437733362618055, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3778204917907715, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7030293941497803, + "num_tokens": 340197922.0, + "step": 13147 + }, + { + "epoch": 1.443883153964419, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.078908681869507, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6791418194770813, + "num_tokens": 340232043.0, + "step": 13148 + }, + { + "epoch": 1.4439929716670328, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.104680061340332, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6992039680480957, + "num_tokens": 340263950.0, + "step": 13149 + }, + { + "epoch": 1.4441027893696463, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3778810501098633, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7112863659858704, + "num_tokens": 340290039.0, + "step": 13150 + }, + { + "epoch": 1.44421260707226, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.561476230621338, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6993942856788635, + "num_tokens": 340312673.0, + "step": 13151 + }, + { + "epoch": 1.4443224247748736, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3046579360961914, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7255241870880127, + "num_tokens": 340339078.0, + "step": 13152 + }, + { + "epoch": 1.4444322424774874, + "ewc_loss": 1.8715858459472656e-05, + 
"grad_norm": 2.4036598205566406, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7084479331970215, + "num_tokens": 340364148.0, + "step": 13153 + }, + { + "epoch": 1.4445420601801011, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2959394454956055, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7192128896713257, + "num_tokens": 340390357.0, + "step": 13154 + }, + { + "epoch": 1.4446518778827147, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4828639030456543, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7211107015609741, + "num_tokens": 340414598.0, + "step": 13155 + }, + { + "epoch": 1.4447616955853284, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5101373195648193, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.705684244632721, + "num_tokens": 340436416.0, + "step": 13156 + }, + { + "epoch": 1.444871513287942, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.142118453979492, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.7040491104125977, + "num_tokens": 340468002.0, + "step": 13157 + }, + { + "epoch": 1.4449813309905557, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7291746139526367, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7044379711151123, + "num_tokens": 340490276.0, + "step": 13158 + }, + { + "epoch": 1.4450911486931695, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.800422191619873, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7213399410247803, + "num_tokens": 340511926.0, + "step": 13159 + }, + { + "epoch": 1.445200966395783, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.304187297821045, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7185945510864258, + "num_tokens": 340537993.0, + "step": 13160 + }, + { + "epoch": 1.4453107840983965, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 
2.4244601726531982, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7025361061096191, + "num_tokens": 340562051.0, + "step": 13161 + }, + { + "epoch": 1.4454206018010103, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.566906452178955, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.700707733631134, + "num_tokens": 340584541.0, + "step": 13162 + }, + { + "epoch": 1.445530419503624, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.529836654663086, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7236564755439758, + "num_tokens": 340607199.0, + "step": 13163 + }, + { + "epoch": 1.4456402372062376, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.446425437927246, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7076315879821777, + "num_tokens": 340631608.0, + "step": 13164 + }, + { + "epoch": 1.4457500549088513, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2662160396575928, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6928022503852844, + "num_tokens": 340659819.0, + "step": 13165 + }, + { + "epoch": 1.4458598726114649, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7989501953125, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7284239530563354, + "num_tokens": 340678889.0, + "step": 13166 + }, + { + "epoch": 1.4459696903140786, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.422391176223755, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7094537019729614, + "num_tokens": 340704348.0, + "step": 13167 + }, + { + "epoch": 1.4460795080166924, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4033279418945312, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7199342846870422, + "num_tokens": 340728379.0, + "step": 13168 + }, + { + "epoch": 1.446189325719306, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.479668617248535, + 
"learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6990172863006592, + "num_tokens": 340753423.0, + "step": 13169 + }, + { + "epoch": 1.4462991434219197, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.315779447555542, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6988341212272644, + "num_tokens": 340779801.0, + "step": 13170 + }, + { + "epoch": 1.4464089611245332, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5539464950561523, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.696972668170929, + "num_tokens": 340805614.0, + "step": 13171 + }, + { + "epoch": 1.446518778827147, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0457539558410645, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7030181884765625, + "num_tokens": 340840393.0, + "step": 13172 + }, + { + "epoch": 1.4466285965297607, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1649343967437744, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6974443197250366, + "num_tokens": 340871054.0, + "step": 13173 + }, + { + "epoch": 1.4467384142323743, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4506351947784424, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6829901933670044, + "num_tokens": 340898566.0, + "step": 13174 + }, + { + "epoch": 1.4468482319349878, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.416947364807129, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7369728088378906, + "num_tokens": 340921348.0, + "step": 13175 + }, + { + "epoch": 1.4469580496376016, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2668681144714355, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7093566060066223, + "num_tokens": 340948757.0, + "step": 13176 + }, + { + "epoch": 1.4470678673402153, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.435988426208496, + "learning_rate": 1e-06, + 
"loss": 1.0278, + "mean_token_accuracy": 0.7024468779563904, + "num_tokens": 340975774.0, + "step": 13177 + }, + { + "epoch": 1.4471776850428288, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6929287910461426, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7306467890739441, + "num_tokens": 340996031.0, + "step": 13178 + }, + { + "epoch": 1.4472875027454426, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.493030071258545, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.7047492265701294, + "num_tokens": 341020197.0, + "step": 13179 + }, + { + "epoch": 1.4473973204480561, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3525049686431885, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7138988971710205, + "num_tokens": 341047284.0, + "step": 13180 + }, + { + "epoch": 1.44750713815067, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5088160037994385, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6985098123550415, + "num_tokens": 341074120.0, + "step": 13181 + }, + { + "epoch": 1.4476169558532836, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.325515031814575, + "learning_rate": 1e-06, + "loss": 1.1018, + "mean_token_accuracy": 0.690877377986908, + "num_tokens": 341106844.0, + "step": 13182 + }, + { + "epoch": 1.4477267735558972, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.260530471801758, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6845588088035583, + "num_tokens": 341133552.0, + "step": 13183 + }, + { + "epoch": 1.447836591258511, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.329914093017578, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7323782444000244, + "num_tokens": 341159404.0, + "step": 13184 + }, + { + "epoch": 1.4479464089611245, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2213990688323975, + "learning_rate": 1e-06, + "loss": 1.0321, + 
"mean_token_accuracy": 0.7064220905303955, + "num_tokens": 341188828.0, + "step": 13185 + }, + { + "epoch": 1.4480562266637382, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4292495250701904, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7224842309951782, + "num_tokens": 341212905.0, + "step": 13186 + }, + { + "epoch": 1.448166044366352, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3728065490722656, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7405701875686646, + "num_tokens": 341237002.0, + "step": 13187 + }, + { + "epoch": 1.4482758620689655, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.158000946044922, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7030206918716431, + "num_tokens": 341268766.0, + "step": 13188 + }, + { + "epoch": 1.448385679771579, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.675949811935425, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7173454761505127, + "num_tokens": 341290040.0, + "step": 13189 + }, + { + "epoch": 1.4484954974741928, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0868008136749268, + "learning_rate": 1e-06, + "loss": 1.075, + "mean_token_accuracy": 0.684573769569397, + "num_tokens": 341322524.0, + "step": 13190 + }, + { + "epoch": 1.4486053151768066, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.488105058670044, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6968992948532104, + "num_tokens": 341345369.0, + "step": 13191 + }, + { + "epoch": 1.44871513287942, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.425018787384033, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7099794149398804, + "num_tokens": 341370538.0, + "step": 13192 + }, + { + "epoch": 1.4488249505820339, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1936655044555664, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 
0.6984890103340149, + "num_tokens": 341402489.0, + "step": 13193 + }, + { + "epoch": 1.4489347682846474, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.399318218231201, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7105745673179626, + "num_tokens": 341428296.0, + "step": 13194 + }, + { + "epoch": 1.4490445859872612, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.408710241317749, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6942211389541626, + "num_tokens": 341455598.0, + "step": 13195 + }, + { + "epoch": 1.449154403689875, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3161137104034424, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7072063088417053, + "num_tokens": 341485051.0, + "step": 13196 + }, + { + "epoch": 1.4492642213924884, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.384563446044922, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7083501219749451, + "num_tokens": 341508687.0, + "step": 13197 + }, + { + "epoch": 1.4493740390951022, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2874650955200195, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7387274503707886, + "num_tokens": 341536130.0, + "step": 13198 + }, + { + "epoch": 1.4494838567977157, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3657000064849854, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7123353481292725, + "num_tokens": 341560952.0, + "step": 13199 + }, + { + "epoch": 1.4495936745003295, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2069132328033447, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7013112306594849, + "num_tokens": 341591777.0, + "step": 13200 + }, + { + "epoch": 1.449703492202943, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3082315921783447, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7049270868301392, + 
"num_tokens": 341621448.0, + "step": 13201 + }, + { + "epoch": 1.4498133099055568, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5075316429138184, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6902649402618408, + "num_tokens": 341647764.0, + "step": 13202 + }, + { + "epoch": 1.4499231276081703, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2000174522399902, + "learning_rate": 1e-06, + "loss": 1.112, + "mean_token_accuracy": 0.675030529499054, + "num_tokens": 341677722.0, + "step": 13203 + }, + { + "epoch": 1.450032945310784, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.551351547241211, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6994355916976929, + "num_tokens": 341699976.0, + "step": 13204 + }, + { + "epoch": 1.4501427630133978, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3625926971435547, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7217336893081665, + "num_tokens": 341727137.0, + "step": 13205 + }, + { + "epoch": 1.4502525807160114, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2697694301605225, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7077462673187256, + "num_tokens": 341756721.0, + "step": 13206 + }, + { + "epoch": 1.4503623984186251, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.548433303833008, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6922873258590698, + "num_tokens": 341781790.0, + "step": 13207 + }, + { + "epoch": 1.4504722161212387, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4784231185913086, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.720633864402771, + "num_tokens": 341804509.0, + "step": 13208 + }, + { + "epoch": 1.4505820338238524, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 32.605159759521484, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6954011917114258, + "num_tokens": 341828413.0, + 
"step": 13209 + }, + { + "epoch": 1.4506918515264662, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4674062728881836, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7233220338821411, + "num_tokens": 341851612.0, + "step": 13210 + }, + { + "epoch": 1.4508016692290797, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.558838129043579, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.745038628578186, + "num_tokens": 341873403.0, + "step": 13211 + }, + { + "epoch": 1.4509114869316933, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.232774257659912, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7127624154090881, + "num_tokens": 341903244.0, + "step": 13212 + }, + { + "epoch": 1.451021304634307, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.952854633331299, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7143715620040894, + "num_tokens": 341921413.0, + "step": 13213 + }, + { + "epoch": 1.4511311223369208, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3694682121276855, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.71773761510849, + "num_tokens": 341946752.0, + "step": 13214 + }, + { + "epoch": 1.4512409400395343, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.357668876647949, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7145488858222961, + "num_tokens": 341974994.0, + "step": 13215 + }, + { + "epoch": 1.451350757742148, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3822054862976074, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7025305032730103, + "num_tokens": 342001400.0, + "step": 13216 + }, + { + "epoch": 1.4514605754447616, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5285279750823975, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7013330459594727, + "num_tokens": 342025161.0, + "step": 13217 + }, + { + 
"epoch": 1.4515703931473753, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3781306743621826, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7186907529830933, + "num_tokens": 342052036.0, + "step": 13218 + }, + { + "epoch": 1.451680210849989, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.123014211654663, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7021011114120483, + "num_tokens": 342081345.0, + "step": 13219 + }, + { + "epoch": 1.4517900285526026, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5518877506256104, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.710745632648468, + "num_tokens": 342103168.0, + "step": 13220 + }, + { + "epoch": 1.4518998462552164, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.389890193939209, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.707499086856842, + "num_tokens": 342128722.0, + "step": 13221 + }, + { + "epoch": 1.45200966395783, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 8.524300575256348, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7029298543930054, + "num_tokens": 342153962.0, + "step": 13222 + }, + { + "epoch": 1.4521194816604437, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.560297966003418, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6896533966064453, + "num_tokens": 342177007.0, + "step": 13223 + }, + { + "epoch": 1.4522292993630574, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4316582679748535, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7126815319061279, + "num_tokens": 342204681.0, + "step": 13224 + }, + { + "epoch": 1.452339117065671, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.674314498901367, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.737251877784729, + "num_tokens": 342223924.0, + "step": 13225 + }, + { + "epoch": 1.4524489347682845, + 
"ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3928561210632324, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7255940437316895, + "num_tokens": 342250807.0, + "step": 13226 + }, + { + "epoch": 1.4525587524708983, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.375720262527466, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7180711030960083, + "num_tokens": 342275965.0, + "step": 13227 + }, + { + "epoch": 1.452668570173512, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.171039581298828, + "learning_rate": 1e-06, + "loss": 1.109, + "mean_token_accuracy": 0.6782660484313965, + "num_tokens": 342307194.0, + "step": 13228 + }, + { + "epoch": 1.4527783878761256, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2477333545684814, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7055758833885193, + "num_tokens": 342335970.0, + "step": 13229 + }, + { + "epoch": 1.4528882055787393, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.464461088180542, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7111608982086182, + "num_tokens": 342359907.0, + "step": 13230 + }, + { + "epoch": 1.4529980232813529, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.44486403465271, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7164398431777954, + "num_tokens": 342384495.0, + "step": 13231 + }, + { + "epoch": 1.4531078409839666, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0974435806274414, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6916468143463135, + "num_tokens": 342419582.0, + "step": 13232 + }, + { + "epoch": 1.4532176586865804, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.363513469696045, + "learning_rate": 1e-06, + "loss": 1.0653, + "mean_token_accuracy": 0.6858338713645935, + "num_tokens": 342446851.0, + "step": 13233 + }, + { + "epoch": 1.453327476389194, + "ewc_loss": 
1.8715858459472656e-05, + "grad_norm": 2.4666786193847656, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7260017395019531, + "num_tokens": 342470904.0, + "step": 13234 + }, + { + "epoch": 1.4534372940918077, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5895655155181885, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.721305251121521, + "num_tokens": 342491848.0, + "step": 13235 + }, + { + "epoch": 1.4535471117944212, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1057448387145996, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.7071197628974915, + "num_tokens": 342521773.0, + "step": 13236 + }, + { + "epoch": 1.453656929497035, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.319589138031006, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6870216131210327, + "num_tokens": 342548782.0, + "step": 13237 + }, + { + "epoch": 1.4537667471996487, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1690475940704346, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.679559588432312, + "num_tokens": 342581713.0, + "step": 13238 + }, + { + "epoch": 1.4538765649022622, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.247616767883301, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.722070574760437, + "num_tokens": 342608817.0, + "step": 13239 + }, + { + "epoch": 1.4539863826048758, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.196819305419922, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7160831093788147, + "num_tokens": 342635871.0, + "step": 13240 + }, + { + "epoch": 1.4540962003074895, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.362372636795044, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7117578983306885, + "num_tokens": 342660502.0, + "step": 13241 + }, + { + "epoch": 1.4542060180101033, + "ewc_loss": 1.8715858459472656e-05, + 
"grad_norm": 2.680516481399536, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.719951868057251, + "num_tokens": 342681603.0, + "step": 13242 + }, + { + "epoch": 1.4543158357127168, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2957770824432373, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.721528172492981, + "num_tokens": 342707872.0, + "step": 13243 + }, + { + "epoch": 1.4544256534153306, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3090126514434814, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6860246658325195, + "num_tokens": 342738552.0, + "step": 13244 + }, + { + "epoch": 1.4545354711179441, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.409391164779663, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.715485692024231, + "num_tokens": 342762795.0, + "step": 13245 + }, + { + "epoch": 1.4546452888205579, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4440481662750244, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7207937836647034, + "num_tokens": 342787005.0, + "step": 13246 + }, + { + "epoch": 1.4547551065231716, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6599552631378174, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7028215527534485, + "num_tokens": 342807908.0, + "step": 13247 + }, + { + "epoch": 1.4548649242257852, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2306530475616455, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6938055753707886, + "num_tokens": 342836125.0, + "step": 13248 + }, + { + "epoch": 1.454974741928399, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.221374034881592, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7246721982955933, + "num_tokens": 342868122.0, + "step": 13249 + }, + { + "epoch": 1.4550845596310125, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.299044370651245, 
+ "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7187252044677734, + "num_tokens": 342895977.0, + "step": 13250 + }, + { + "epoch": 1.4551943773336262, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4154891967773438, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.697128415107727, + "num_tokens": 342921265.0, + "step": 13251 + }, + { + "epoch": 1.45530419503624, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.307666063308716, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7126444578170776, + "num_tokens": 342948013.0, + "step": 13252 + }, + { + "epoch": 1.4554140127388535, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.113297462463379, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7096819877624512, + "num_tokens": 342979030.0, + "step": 13253 + }, + { + "epoch": 1.455523830441467, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5229787826538086, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7199976444244385, + "num_tokens": 343001334.0, + "step": 13254 + }, + { + "epoch": 1.4556336481440808, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2785840034484863, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.7001317739486694, + "num_tokens": 343030971.0, + "step": 13255 + }, + { + "epoch": 1.4557434658466946, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.447540521621704, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.694430947303772, + "num_tokens": 343057286.0, + "step": 13256 + }, + { + "epoch": 1.455853283549308, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.282895088195801, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6997702717781067, + "num_tokens": 343087943.0, + "step": 13257 + }, + { + "epoch": 1.4559631012519219, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5579993724823, + "learning_rate": 1e-06, + "loss": 
0.9339, + "mean_token_accuracy": 0.7208381295204163, + "num_tokens": 343110953.0, + "step": 13258 + }, + { + "epoch": 1.4560729189545354, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5374698638916016, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.71012282371521, + "num_tokens": 343133106.0, + "step": 13259 + }, + { + "epoch": 1.4561827366571491, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4890034198760986, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7180207371711731, + "num_tokens": 343157036.0, + "step": 13260 + }, + { + "epoch": 1.456292554359763, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.265488862991333, + "learning_rate": 1e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6820756196975708, + "num_tokens": 343186413.0, + "step": 13261 + }, + { + "epoch": 1.4564023720623764, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0911474227905273, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6905659437179565, + "num_tokens": 343218981.0, + "step": 13262 + }, + { + "epoch": 1.4565121897649902, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3014791011810303, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.696627676486969, + "num_tokens": 343245759.0, + "step": 13263 + }, + { + "epoch": 1.4566220074676037, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.781719923019409, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7551082372665405, + "num_tokens": 343265382.0, + "step": 13264 + }, + { + "epoch": 1.4567318251702175, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.699876546859741, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6960405707359314, + "num_tokens": 343287786.0, + "step": 13265 + }, + { + "epoch": 1.456841642872831, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4285757541656494, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 
0.7179092168807983, + "num_tokens": 343312571.0, + "step": 13266 + }, + { + "epoch": 1.4569514605754448, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.278404951095581, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6942410469055176, + "num_tokens": 343338845.0, + "step": 13267 + }, + { + "epoch": 1.4570612782780583, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2551257610321045, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.7013766765594482, + "num_tokens": 343366259.0, + "step": 13268 + }, + { + "epoch": 1.457171095980672, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4577555656433105, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.700259804725647, + "num_tokens": 343389709.0, + "step": 13269 + }, + { + "epoch": 1.4572809136832858, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.631840944290161, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7460978031158447, + "num_tokens": 343409779.0, + "step": 13270 + }, + { + "epoch": 1.4573907313858994, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5311403274536133, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7142564058303833, + "num_tokens": 343433056.0, + "step": 13271 + }, + { + "epoch": 1.4575005490885131, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.542026996612549, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7101699709892273, + "num_tokens": 343455858.0, + "step": 13272 + }, + { + "epoch": 1.4576103667911267, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7660109996795654, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7169756889343262, + "num_tokens": 343474974.0, + "step": 13273 + }, + { + "epoch": 1.4577201844937404, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6553874015808105, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7107709050178528, + 
"num_tokens": 343494711.0, + "step": 13274 + }, + { + "epoch": 1.4578300021963542, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.387364387512207, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7040854096412659, + "num_tokens": 343523387.0, + "step": 13275 + }, + { + "epoch": 1.4579398198989677, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6099958419799805, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7275207042694092, + "num_tokens": 343544079.0, + "step": 13276 + }, + { + "epoch": 1.4580496376015812, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.44478702545166, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.711738646030426, + "num_tokens": 343571845.0, + "step": 13277 + }, + { + "epoch": 1.458159455304195, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3626601696014404, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7171436548233032, + "num_tokens": 343596591.0, + "step": 13278 + }, + { + "epoch": 1.4582692730068088, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3008668422698975, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7205785512924194, + "num_tokens": 343622319.0, + "step": 13279 + }, + { + "epoch": 1.4583790907094223, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4881350994110107, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7120695114135742, + "num_tokens": 343645477.0, + "step": 13280 + }, + { + "epoch": 1.458488908412036, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.696509838104248, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7088838815689087, + "num_tokens": 343664693.0, + "step": 13281 + }, + { + "epoch": 1.4585987261146496, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.770143508911133, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.736342191696167, + "num_tokens": 343683758.0, + 
"step": 13282 + }, + { + "epoch": 1.4587085438172633, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5522820949554443, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6996347904205322, + "num_tokens": 343708334.0, + "step": 13283 + }, + { + "epoch": 1.458818361519877, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1223104000091553, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7053824067115784, + "num_tokens": 343740386.0, + "step": 13284 + }, + { + "epoch": 1.4589281792224906, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5299439430236816, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7218869924545288, + "num_tokens": 343765478.0, + "step": 13285 + }, + { + "epoch": 1.4590379969251044, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.479698419570923, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.722631573677063, + "num_tokens": 343790678.0, + "step": 13286 + }, + { + "epoch": 1.459147814627718, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.239694118499756, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7054756283760071, + "num_tokens": 343818608.0, + "step": 13287 + }, + { + "epoch": 1.4592576323303317, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2283480167388916, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7144975662231445, + "num_tokens": 343844894.0, + "step": 13288 + }, + { + "epoch": 1.4593674500329454, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.578336238861084, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7112303972244263, + "num_tokens": 343866536.0, + "step": 13289 + }, + { + "epoch": 1.459477267735559, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1518547534942627, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.709908127784729, + "num_tokens": 343894798.0, + "step": 13290 + }, + { + 
"epoch": 1.4595870854381725, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3372039794921875, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6944433450698853, + "num_tokens": 343921430.0, + "step": 13291 + }, + { + "epoch": 1.4596969031407863, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.272428512573242, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7243301868438721, + "num_tokens": 343949715.0, + "step": 13292 + }, + { + "epoch": 1.4598067208434, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3721845149993896, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7270644307136536, + "num_tokens": 343974416.0, + "step": 13293 + }, + { + "epoch": 1.4599165385460136, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.269089460372925, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6897034049034119, + "num_tokens": 344004519.0, + "step": 13294 + }, + { + "epoch": 1.4600263562486273, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3466763496398926, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7565428018569946, + "num_tokens": 344028304.0, + "step": 13295 + }, + { + "epoch": 1.4601361739512408, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.400883913040161, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7252768278121948, + "num_tokens": 344053256.0, + "step": 13296 + }, + { + "epoch": 1.4602459916538546, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.831408739089966, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7150564789772034, + "num_tokens": 344073480.0, + "step": 13297 + }, + { + "epoch": 1.4603558093564684, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1847381591796875, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6909864544868469, + "num_tokens": 344103404.0, + "step": 13298 + }, + { + "epoch": 1.460465627059082, + 
"ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2868974208831787, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.715038537979126, + "num_tokens": 344129978.0, + "step": 13299 + }, + { + "epoch": 1.4605754447616957, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.529690980911255, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.732629656791687, + "num_tokens": 344153913.0, + "step": 13300 + }, + { + "epoch": 1.4606852624643092, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4553327560424805, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7125836610794067, + "num_tokens": 344179762.0, + "step": 13301 + }, + { + "epoch": 1.460795080166923, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5437369346618652, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7172252535820007, + "num_tokens": 344203479.0, + "step": 13302 + }, + { + "epoch": 1.4609048978695367, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5271518230438232, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7026164531707764, + "num_tokens": 344227702.0, + "step": 13303 + }, + { + "epoch": 1.4610147155721502, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.553877353668213, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7145904898643494, + "num_tokens": 344250615.0, + "step": 13304 + }, + { + "epoch": 1.4611245332747638, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2318851947784424, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6912393569946289, + "num_tokens": 344279535.0, + "step": 13305 + }, + { + "epoch": 1.4612343509773775, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0907094478607178, + "learning_rate": 1e-06, + "loss": 1.1806, + "mean_token_accuracy": 0.6590524911880493, + "num_tokens": 344315144.0, + "step": 13306 + }, + { + "epoch": 1.4613441686799913, + "ewc_loss": 
1.8715858459472656e-05, + "grad_norm": 2.4981539249420166, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.706613302230835, + "num_tokens": 344338533.0, + "step": 13307 + }, + { + "epoch": 1.4614539863826048, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6376428604125977, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7015049457550049, + "num_tokens": 344364248.0, + "step": 13308 + }, + { + "epoch": 1.4615638040852186, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.628448724746704, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7196370363235474, + "num_tokens": 344386105.0, + "step": 13309 + }, + { + "epoch": 1.4616736217878321, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.663353681564331, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7140762805938721, + "num_tokens": 344407088.0, + "step": 13310 + }, + { + "epoch": 1.4617834394904459, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.616443157196045, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.7050072550773621, + "num_tokens": 344428868.0, + "step": 13311 + }, + { + "epoch": 1.4618932571930596, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.476912021636963, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6953694820404053, + "num_tokens": 344454809.0, + "step": 13312 + }, + { + "epoch": 1.4620030748956732, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1423540115356445, + "learning_rate": 1e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.683255672454834, + "num_tokens": 344487596.0, + "step": 13313 + }, + { + "epoch": 1.462112892598287, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.416992664337158, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6883329153060913, + "num_tokens": 344515855.0, + "step": 13314 + }, + { + "epoch": 1.4622227103009005, + "ewc_loss": 1.8715858459472656e-05, + 
"grad_norm": 2.5043351650238037, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7187566161155701, + "num_tokens": 344540736.0, + "step": 13315 + }, + { + "epoch": 1.4623325280035142, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.372150182723999, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6904892921447754, + "num_tokens": 344569990.0, + "step": 13316 + }, + { + "epoch": 1.462442345706128, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1787514686584473, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7191526889801025, + "num_tokens": 344598249.0, + "step": 13317 + }, + { + "epoch": 1.4625521634087415, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3868930339813232, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6910374760627747, + "num_tokens": 344623371.0, + "step": 13318 + }, + { + "epoch": 1.462661981111355, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.807445526123047, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.720356822013855, + "num_tokens": 344645429.0, + "step": 13319 + }, + { + "epoch": 1.4627717988139688, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3967015743255615, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7207406759262085, + "num_tokens": 344668941.0, + "step": 13320 + }, + { + "epoch": 1.4628816165165826, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.690603256225586, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7404142022132874, + "num_tokens": 344687947.0, + "step": 13321 + }, + { + "epoch": 1.462991434219196, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.41009521484375, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7348132729530334, + "num_tokens": 344714531.0, + "step": 13322 + }, + { + "epoch": 1.4631012519218098, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.065434694290161, 
+ "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7131823301315308, + "num_tokens": 344745952.0, + "step": 13323 + }, + { + "epoch": 1.4632110696244234, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.476992130279541, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.71352219581604, + "num_tokens": 344769489.0, + "step": 13324 + }, + { + "epoch": 1.4633208873270371, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3612256050109863, + "learning_rate": 1e-06, + "loss": 1.0883, + "mean_token_accuracy": 0.6784743666648865, + "num_tokens": 344798325.0, + "step": 13325 + }, + { + "epoch": 1.463430705029651, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2661566734313965, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7095767259597778, + "num_tokens": 344826816.0, + "step": 13326 + }, + { + "epoch": 1.4635405227322644, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2038652896881104, + "learning_rate": 1e-06, + "loss": 1.0942, + "mean_token_accuracy": 0.6911863684654236, + "num_tokens": 344857874.0, + "step": 13327 + }, + { + "epoch": 1.4636503404348782, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4038290977478027, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6976169347763062, + "num_tokens": 344883472.0, + "step": 13328 + }, + { + "epoch": 1.4637601581374917, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4403762817382812, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.708628237247467, + "num_tokens": 344911717.0, + "step": 13329 + }, + { + "epoch": 1.4638699758401055, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.526423931121826, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6996911764144897, + "num_tokens": 344935917.0, + "step": 13330 + }, + { + "epoch": 1.463979793542719, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.493349075317383, + "learning_rate": 1e-06, + 
"loss": 0.9607, + "mean_token_accuracy": 0.7174887657165527, + "num_tokens": 344960203.0, + "step": 13331 + }, + { + "epoch": 1.4640896112453328, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3910703659057617, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7219220399856567, + "num_tokens": 344983619.0, + "step": 13332 + }, + { + "epoch": 1.4641994289479463, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1014530658721924, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.705350399017334, + "num_tokens": 345015122.0, + "step": 13333 + }, + { + "epoch": 1.46430924665056, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2272777557373047, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7065558433532715, + "num_tokens": 345045631.0, + "step": 13334 + }, + { + "epoch": 1.4644190643531738, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.244224786758423, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7088763117790222, + "num_tokens": 345076630.0, + "step": 13335 + }, + { + "epoch": 1.4645288820557874, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2616710662841797, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7311570048332214, + "num_tokens": 345101823.0, + "step": 13336 + }, + { + "epoch": 1.464638699758401, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.357562780380249, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7102100849151611, + "num_tokens": 345128795.0, + "step": 13337 + }, + { + "epoch": 1.4647485174610146, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.320127248764038, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.707347571849823, + "num_tokens": 345156852.0, + "step": 13338 + }, + { + "epoch": 1.4648583351636284, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5589396953582764, + "learning_rate": 1e-06, + "loss": 0.943, + 
"mean_token_accuracy": 0.7192347049713135, + "num_tokens": 345182349.0, + "step": 13339 + }, + { + "epoch": 1.4649681528662422, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.248616933822632, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7247921228408813, + "num_tokens": 345210182.0, + "step": 13340 + }, + { + "epoch": 1.4650779705688557, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5978338718414307, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7445759773254395, + "num_tokens": 345230677.0, + "step": 13341 + }, + { + "epoch": 1.4651877882714692, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.429593324661255, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.71425861120224, + "num_tokens": 345256864.0, + "step": 13342 + }, + { + "epoch": 1.465297605974083, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3608415126800537, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7309103012084961, + "num_tokens": 345283478.0, + "step": 13343 + }, + { + "epoch": 1.4654074236766967, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4451279640197754, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.700928807258606, + "num_tokens": 345307144.0, + "step": 13344 + }, + { + "epoch": 1.4655172413793103, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2146804332733154, + "learning_rate": 1e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.6936209201812744, + "num_tokens": 345336359.0, + "step": 13345 + }, + { + "epoch": 1.465627059081924, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.442484140396118, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7238509654998779, + "num_tokens": 345359927.0, + "step": 13346 + }, + { + "epoch": 1.4657368767845376, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7594995498657227, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 
0.7249495983123779, + "num_tokens": 345380985.0, + "step": 13347 + }, + { + "epoch": 1.4658466944871513, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3268043994903564, + "learning_rate": 1e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6830214858055115, + "num_tokens": 345409928.0, + "step": 13348 + }, + { + "epoch": 1.465956512189765, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4454996585845947, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7126220464706421, + "num_tokens": 345435622.0, + "step": 13349 + }, + { + "epoch": 1.4660663298923786, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6844594478607178, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7081762552261353, + "num_tokens": 345457381.0, + "step": 13350 + }, + { + "epoch": 1.4661761475949924, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5121560096740723, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7235385179519653, + "num_tokens": 345480374.0, + "step": 13351 + }, + { + "epoch": 1.466285965297606, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4877398014068604, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.744426429271698, + "num_tokens": 345505238.0, + "step": 13352 + }, + { + "epoch": 1.4663957830002197, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4763388633728027, + "learning_rate": 1e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6884551644325256, + "num_tokens": 345531950.0, + "step": 13353 + }, + { + "epoch": 1.4665056007028334, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5507781505584717, + "learning_rate": 1e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6873073577880859, + "num_tokens": 345556257.0, + "step": 13354 + }, + { + "epoch": 1.466615418405447, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4515318870544434, + "learning_rate": 1e-06, + "loss": 1.1246, + "mean_token_accuracy": 0.6844538450241089, + 
"num_tokens": 345586451.0, + "step": 13355 + }, + { + "epoch": 1.4667252361080605, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6032824516296387, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7088799476623535, + "num_tokens": 345610752.0, + "step": 13356 + }, + { + "epoch": 1.4668350538106742, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7034077644348145, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.721153974533081, + "num_tokens": 345631872.0, + "step": 13357 + }, + { + "epoch": 1.466944871513288, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.215329885482788, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6937367916107178, + "num_tokens": 345661342.0, + "step": 13358 + }, + { + "epoch": 1.4670546892159015, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.471038341522217, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7464327216148376, + "num_tokens": 345683908.0, + "step": 13359 + }, + { + "epoch": 1.4671645069185153, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4381489753723145, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6990802884101868, + "num_tokens": 345709061.0, + "step": 13360 + }, + { + "epoch": 1.4672743246211288, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5282981395721436, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7106488943099976, + "num_tokens": 345734093.0, + "step": 13361 + }, + { + "epoch": 1.4673841423237426, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3760106563568115, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6962791681289673, + "num_tokens": 345762128.0, + "step": 13362 + }, + { + "epoch": 1.4674939600263563, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4767704010009766, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7142370343208313, + "num_tokens": 345785034.0, + 
"step": 13363 + }, + { + "epoch": 1.4676037777289699, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4346325397491455, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7109078168869019, + "num_tokens": 345808752.0, + "step": 13364 + }, + { + "epoch": 1.4677135954315836, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.286388397216797, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.7028166651725769, + "num_tokens": 345838267.0, + "step": 13365 + }, + { + "epoch": 1.4678234131341972, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3948960304260254, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7286473512649536, + "num_tokens": 345861573.0, + "step": 13366 + }, + { + "epoch": 1.467933230836811, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.354576349258423, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7075995206832886, + "num_tokens": 345888270.0, + "step": 13367 + }, + { + "epoch": 1.4680430485394247, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.660325050354004, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7231715321540833, + "num_tokens": 345909122.0, + "step": 13368 + }, + { + "epoch": 1.4681528662420382, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.129020929336548, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6984258890151978, + "num_tokens": 345941407.0, + "step": 13369 + }, + { + "epoch": 1.4682626839446518, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3437702655792236, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7029114961624146, + "num_tokens": 345967749.0, + "step": 13370 + }, + { + "epoch": 1.4683725016472655, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7788138389587402, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7340776920318604, + "num_tokens": 345986427.0, + "step": 13371 + }, + { + 
"epoch": 1.4684823193498793, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.416466236114502, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7113924026489258, + "num_tokens": 346014159.0, + "step": 13372 + }, + { + "epoch": 1.4685921370524928, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.205085039138794, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7166780829429626, + "num_tokens": 346043607.0, + "step": 13373 + }, + { + "epoch": 1.4687019547551066, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.436601161956787, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7153691053390503, + "num_tokens": 346068148.0, + "step": 13374 + }, + { + "epoch": 1.46881177245772, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.61478853225708, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7099537253379822, + "num_tokens": 346089553.0, + "step": 13375 + }, + { + "epoch": 1.4689215901603339, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.421067237854004, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.7009255290031433, + "num_tokens": 346119179.0, + "step": 13376 + }, + { + "epoch": 1.4690314078629476, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6156485080718994, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7376044392585754, + "num_tokens": 346137783.0, + "step": 13377 + }, + { + "epoch": 1.4691412255655611, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3974037170410156, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7050609588623047, + "num_tokens": 346163726.0, + "step": 13378 + }, + { + "epoch": 1.469251043268175, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1994597911834717, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7022650837898254, + "num_tokens": 346192312.0, + "step": 13379 + }, + { + "epoch": 1.4693608609707884, + 
"ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2736973762512207, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6955260038375854, + "num_tokens": 346221468.0, + "step": 13380 + }, + { + "epoch": 1.4694706786734022, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3344569206237793, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7087974548339844, + "num_tokens": 346247660.0, + "step": 13381 + }, + { + "epoch": 1.4695804963760157, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.364091157913208, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7054055333137512, + "num_tokens": 346275419.0, + "step": 13382 + }, + { + "epoch": 1.4696903140786295, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6496834754943848, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7202249765396118, + "num_tokens": 346299583.0, + "step": 13383 + }, + { + "epoch": 1.469800131781243, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.8172035217285156, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7263849973678589, + "num_tokens": 346319314.0, + "step": 13384 + }, + { + "epoch": 1.4699099494838568, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4349546432495117, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6915467977523804, + "num_tokens": 346344487.0, + "step": 13385 + }, + { + "epoch": 1.4700197671864705, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.345280170440674, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7193778157234192, + "num_tokens": 346371288.0, + "step": 13386 + }, + { + "epoch": 1.470129584889084, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3887453079223633, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7231295108795166, + "num_tokens": 346397949.0, + "step": 13387 + }, + { + "epoch": 1.4702394025916978, + "ewc_loss": 
1.8715858459472656e-05, + "grad_norm": 2.7223827838897705, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7178781032562256, + "num_tokens": 346418085.0, + "step": 13388 + }, + { + "epoch": 1.4703492202943114, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.246328830718994, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.705447793006897, + "num_tokens": 346445774.0, + "step": 13389 + }, + { + "epoch": 1.4704590379969251, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1103429794311523, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.702591598033905, + "num_tokens": 346479092.0, + "step": 13390 + }, + { + "epoch": 1.4705688556995389, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.809561014175415, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6976516246795654, + "num_tokens": 346500975.0, + "step": 13391 + }, + { + "epoch": 1.4706786734021524, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5537848472595215, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7111036777496338, + "num_tokens": 346524780.0, + "step": 13392 + }, + { + "epoch": 1.4707884911047662, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.221522808074951, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6946501731872559, + "num_tokens": 346554229.0, + "step": 13393 + }, + { + "epoch": 1.4708983088073797, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2362747192382812, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7235649824142456, + "num_tokens": 346581471.0, + "step": 13394 + }, + { + "epoch": 1.4710081265099935, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3737597465515137, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7185121774673462, + "num_tokens": 346606506.0, + "step": 13395 + }, + { + "epoch": 1.471117944212607, + "ewc_loss": 1.8715858459472656e-05, + 
"grad_norm": 2.378253698348999, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6970897912979126, + "num_tokens": 346632543.0, + "step": 13396 + }, + { + "epoch": 1.4712277619152208, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7448883056640625, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.718252420425415, + "num_tokens": 346654268.0, + "step": 13397 + }, + { + "epoch": 1.4713375796178343, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.171910524368286, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7457270622253418, + "num_tokens": 346681670.0, + "step": 13398 + }, + { + "epoch": 1.471447397320448, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3910229206085205, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7326517105102539, + "num_tokens": 346706343.0, + "step": 13399 + }, + { + "epoch": 1.4715572150230618, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4413652420043945, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7172122001647949, + "num_tokens": 346731063.0, + "step": 13400 + }, + { + "epoch": 1.4716670327256753, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4564669132232666, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7141905426979065, + "num_tokens": 346755843.0, + "step": 13401 + }, + { + "epoch": 1.471776850428289, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3798820972442627, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7025031447410583, + "num_tokens": 346782314.0, + "step": 13402 + }, + { + "epoch": 1.4718866681309026, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6050338745117188, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7278689742088318, + "num_tokens": 346804881.0, + "step": 13403 + }, + { + "epoch": 1.4719964858335164, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 
2.4060616493225098, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6855634450912476, + "num_tokens": 346833038.0, + "step": 13404 + }, + { + "epoch": 1.4721063035361301, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5179686546325684, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7296376824378967, + "num_tokens": 346854806.0, + "step": 13405 + }, + { + "epoch": 1.4722161212387437, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.162952184677124, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6931135654449463, + "num_tokens": 346886827.0, + "step": 13406 + }, + { + "epoch": 1.4723259389413572, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4753613471984863, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7107471227645874, + "num_tokens": 346909719.0, + "step": 13407 + }, + { + "epoch": 1.472435756643971, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.300295829772949, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7159198522567749, + "num_tokens": 346936087.0, + "step": 13408 + }, + { + "epoch": 1.4725455743465847, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.444058656692505, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6926181316375732, + "num_tokens": 346962253.0, + "step": 13409 + }, + { + "epoch": 1.4726553920491983, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4168615341186523, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7044371366500854, + "num_tokens": 346987083.0, + "step": 13410 + }, + { + "epoch": 1.472765209751812, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.533416748046875, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7286137342453003, + "num_tokens": 347008954.0, + "step": 13411 + }, + { + "epoch": 1.4728750274544256, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3113772869110107, + 
"learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.699203610420227, + "num_tokens": 347037526.0, + "step": 13412 + }, + { + "epoch": 1.4729848451570393, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.450535297393799, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7213720083236694, + "num_tokens": 347060196.0, + "step": 13413 + }, + { + "epoch": 1.473094662859653, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.8789730072021484, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7259476184844971, + "num_tokens": 347079279.0, + "step": 13414 + }, + { + "epoch": 1.4732044805622666, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4630346298217773, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7119722962379456, + "num_tokens": 347101296.0, + "step": 13415 + }, + { + "epoch": 1.4733142982648804, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4541831016540527, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.719016432762146, + "num_tokens": 347124779.0, + "step": 13416 + }, + { + "epoch": 1.473424115967494, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.518468141555786, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7207774519920349, + "num_tokens": 347148769.0, + "step": 13417 + }, + { + "epoch": 1.4735339336701077, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2357263565063477, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7228132486343384, + "num_tokens": 347176103.0, + "step": 13418 + }, + { + "epoch": 1.4736437513727214, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.46124529838562, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7111457586288452, + "num_tokens": 347201415.0, + "step": 13419 + }, + { + "epoch": 1.473753569075335, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3013651371002197, + "learning_rate": 1e-06, + "loss": 
0.9411, + "mean_token_accuracy": 0.7283344864845276, + "num_tokens": 347228122.0, + "step": 13420 + }, + { + "epoch": 1.4738633867779485, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3319544792175293, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.710215151309967, + "num_tokens": 347254149.0, + "step": 13421 + }, + { + "epoch": 1.4739732044805622, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5217888355255127, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7202252745628357, + "num_tokens": 347278195.0, + "step": 13422 + }, + { + "epoch": 1.474083022183176, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4310660362243652, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.738256573677063, + "num_tokens": 347301112.0, + "step": 13423 + }, + { + "epoch": 1.4741928398857895, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3891472816467285, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7071079015731812, + "num_tokens": 347326955.0, + "step": 13424 + }, + { + "epoch": 1.4743026575884033, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4233012199401855, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7103019952774048, + "num_tokens": 347352680.0, + "step": 13425 + }, + { + "epoch": 1.4744124752910168, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.408642053604126, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.708671510219574, + "num_tokens": 347379559.0, + "step": 13426 + }, + { + "epoch": 1.4745222929936306, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.305696487426758, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7294660806655884, + "num_tokens": 347404330.0, + "step": 13427 + }, + { + "epoch": 1.4746321106962443, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.249619483947754, + "learning_rate": 1e-06, + "loss": 0.9651, + 
"mean_token_accuracy": 0.7078069448471069, + "num_tokens": 347432882.0, + "step": 13428 + }, + { + "epoch": 1.4747419283988579, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2869317531585693, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.714057981967926, + "num_tokens": 347460572.0, + "step": 13429 + }, + { + "epoch": 1.4748517461014716, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4768154621124268, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7012149095535278, + "num_tokens": 347486650.0, + "step": 13430 + }, + { + "epoch": 1.4749615638040852, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.383383274078369, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7210135459899902, + "num_tokens": 347514758.0, + "step": 13431 + }, + { + "epoch": 1.475071381506699, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3888049125671387, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7095736265182495, + "num_tokens": 347539986.0, + "step": 13432 + }, + { + "epoch": 1.4751811992093127, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.466264247894287, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7062137126922607, + "num_tokens": 347563513.0, + "step": 13433 + }, + { + "epoch": 1.4752910169119262, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2519237995147705, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7199609279632568, + "num_tokens": 347591159.0, + "step": 13434 + }, + { + "epoch": 1.4754008346145397, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2818470001220703, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7148763537406921, + "num_tokens": 347618018.0, + "step": 13435 + }, + { + "epoch": 1.4755106523171535, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.289520740509033, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 
0.6933259963989258, + "num_tokens": 347645281.0, + "step": 13436 + }, + { + "epoch": 1.4756204700197673, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4451828002929688, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7125566005706787, + "num_tokens": 347669321.0, + "step": 13437 + }, + { + "epoch": 1.4757302877223808, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3295514583587646, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.698358416557312, + "num_tokens": 347696628.0, + "step": 13438 + }, + { + "epoch": 1.4758401054249946, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5139803886413574, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.7008782625198364, + "num_tokens": 347720666.0, + "step": 13439 + }, + { + "epoch": 1.475949923127608, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.44913911819458, + "learning_rate": 1e-06, + "loss": 1.1058, + "mean_token_accuracy": 0.6823595762252808, + "num_tokens": 347748398.0, + "step": 13440 + }, + { + "epoch": 1.4760597408302218, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2208094596862793, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6901195645332336, + "num_tokens": 347779219.0, + "step": 13441 + }, + { + "epoch": 1.4761695585328356, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.608566999435425, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7133013606071472, + "num_tokens": 347800401.0, + "step": 13442 + }, + { + "epoch": 1.4762793762354491, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.498068332672119, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7271233797073364, + "num_tokens": 347824024.0, + "step": 13443 + }, + { + "epoch": 1.476389193938063, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3198788166046143, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7252566814422607, + "num_tokens": 
347850139.0, + "step": 13444 + }, + { + "epoch": 1.4764990116406764, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3326416015625, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7093427777290344, + "num_tokens": 347875516.0, + "step": 13445 + }, + { + "epoch": 1.4766088293432902, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.150960922241211, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6903843879699707, + "num_tokens": 347906133.0, + "step": 13446 + }, + { + "epoch": 1.4767186470459037, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.184190034866333, + "learning_rate": 1e-06, + "loss": 1.0768, + "mean_token_accuracy": 0.6866216659545898, + "num_tokens": 347938177.0, + "step": 13447 + }, + { + "epoch": 1.4768284647485175, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3513121604919434, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6960341930389404, + "num_tokens": 347964338.0, + "step": 13448 + }, + { + "epoch": 1.476938282451131, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.8061935901641846, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7142021656036377, + "num_tokens": 347983881.0, + "step": 13449 + }, + { + "epoch": 1.4770481001537448, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.30220103263855, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7064778804779053, + "num_tokens": 348011842.0, + "step": 13450 + }, + { + "epoch": 1.4771579178563585, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.464420795440674, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7132863998413086, + "num_tokens": 348037002.0, + "step": 13451 + }, + { + "epoch": 1.477267735558972, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3708419799804688, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.712570071220398, + "num_tokens": 348063768.0, + "step": 13452 + }, 
+ { + "epoch": 1.4773775532615858, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1874797344207764, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7350330948829651, + "num_tokens": 348092857.0, + "step": 13453 + }, + { + "epoch": 1.4774873709641994, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4658427238464355, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.709314227104187, + "num_tokens": 348116796.0, + "step": 13454 + }, + { + "epoch": 1.477597188666813, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4100537300109863, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7188835144042969, + "num_tokens": 348142158.0, + "step": 13455 + }, + { + "epoch": 1.4777070063694269, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.422100782394409, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7012694478034973, + "num_tokens": 348170191.0, + "step": 13456 + }, + { + "epoch": 1.4778168240720404, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.342400312423706, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7289614677429199, + "num_tokens": 348194238.0, + "step": 13457 + }, + { + "epoch": 1.477926641774654, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0747487545013428, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7355303764343262, + "num_tokens": 348226443.0, + "step": 13458 + }, + { + "epoch": 1.4780364594772677, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3140869140625, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7085050940513611, + "num_tokens": 348253733.0, + "step": 13459 + }, + { + "epoch": 1.4781462771798815, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4226865768432617, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7087255716323853, + "num_tokens": 348278012.0, + "step": 13460 + }, + { + "epoch": 
1.478256094882495, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7328970432281494, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7185549736022949, + "num_tokens": 348298231.0, + "step": 13461 + }, + { + "epoch": 1.4783659125851087, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0536108016967773, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7169787883758545, + "num_tokens": 348330234.0, + "step": 13462 + }, + { + "epoch": 1.4784757302877223, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2304961681365967, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6956815719604492, + "num_tokens": 348359195.0, + "step": 13463 + }, + { + "epoch": 1.478585547990336, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2813971042633057, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.710706889629364, + "num_tokens": 348385778.0, + "step": 13464 + }, + { + "epoch": 1.4786953656929498, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.55509090423584, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6825426816940308, + "num_tokens": 348409555.0, + "step": 13465 + }, + { + "epoch": 1.4788051833955633, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.295719861984253, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7160544395446777, + "num_tokens": 348436878.0, + "step": 13466 + }, + { + "epoch": 1.478915001098177, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4034972190856934, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7134889364242554, + "num_tokens": 348460782.0, + "step": 13467 + }, + { + "epoch": 1.4790248188007906, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.066330671310425, + "learning_rate": 1e-06, + "loss": 1.0824, + "mean_token_accuracy": 0.6799720525741577, + "num_tokens": 348493364.0, + "step": 13468 + }, + { + "epoch": 1.4791346365034044, + 
"ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2015650272369385, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7186003923416138, + "num_tokens": 348522052.0, + "step": 13469 + }, + { + "epoch": 1.4792444542060181, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2988927364349365, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7042815089225769, + "num_tokens": 348550787.0, + "step": 13470 + }, + { + "epoch": 1.4793542719086317, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.615669012069702, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.739818811416626, + "num_tokens": 348569893.0, + "step": 13471 + }, + { + "epoch": 1.4794640896112452, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.747269630432129, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7310289144515991, + "num_tokens": 348589231.0, + "step": 13472 + }, + { + "epoch": 1.479573907313859, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.45902419090271, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7352355718612671, + "num_tokens": 348612204.0, + "step": 13473 + }, + { + "epoch": 1.4796837250164727, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4543850421905518, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6984992027282715, + "num_tokens": 348635785.0, + "step": 13474 + }, + { + "epoch": 1.4797935427190863, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.148803234100342, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6952463984489441, + "num_tokens": 348667896.0, + "step": 13475 + }, + { + "epoch": 1.4799033604217, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5122673511505127, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7163261771202087, + "num_tokens": 348693407.0, + "step": 13476 + }, + { + "epoch": 1.4800131781243135, + "ewc_loss": 
1.8715858459472656e-05, + "grad_norm": 2.5820958614349365, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7294560670852661, + "num_tokens": 348716369.0, + "step": 13477 + }, + { + "epoch": 1.4801229958269273, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2159528732299805, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6989748477935791, + "num_tokens": 348746705.0, + "step": 13478 + }, + { + "epoch": 1.480232813529541, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.7693979740142822, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7395215034484863, + "num_tokens": 348766027.0, + "step": 13479 + }, + { + "epoch": 1.4803426312321546, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2267708778381348, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7118853330612183, + "num_tokens": 348793646.0, + "step": 13480 + }, + { + "epoch": 1.4804524489347684, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3740644454956055, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7124930620193481, + "num_tokens": 348820525.0, + "step": 13481 + }, + { + "epoch": 1.4805622666373819, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.429189443588257, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7039137482643127, + "num_tokens": 348847130.0, + "step": 13482 + }, + { + "epoch": 1.4806720843399956, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.49609637260437, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7172635197639465, + "num_tokens": 348870238.0, + "step": 13483 + }, + { + "epoch": 1.4807819020426094, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3498435020446777, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6897550225257874, + "num_tokens": 348898170.0, + "step": 13484 + }, + { + "epoch": 1.480891719745223, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.5770490169525146, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6902353167533875, + "num_tokens": 348928003.0, + "step": 13485 + }, + { + "epoch": 1.4810015374478365, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.653104066848755, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7141316533088684, + "num_tokens": 348953028.0, + "step": 13486 + }, + { + "epoch": 1.4811113551504502, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.363591432571411, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7299623489379883, + "num_tokens": 348981766.0, + "step": 13487 + }, + { + "epoch": 1.481221172853064, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.232433795928955, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7022107839584351, + "num_tokens": 349010130.0, + "step": 13488 + }, + { + "epoch": 1.4813309905556775, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4082818031311035, + "learning_rate": 1e-06, + "loss": 1.0858, + "mean_token_accuracy": 0.6862237453460693, + "num_tokens": 349037423.0, + "step": 13489 + }, + { + "epoch": 1.4814408082582913, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4686450958251953, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6951075196266174, + "num_tokens": 349063582.0, + "step": 13490 + }, + { + "epoch": 1.4815506259609048, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.152923107147217, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6984405517578125, + "num_tokens": 349097443.0, + "step": 13491 + }, + { + "epoch": 1.4816604436635186, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3840134143829346, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6959421038627625, + "num_tokens": 349124836.0, + "step": 13492 + }, + { + "epoch": 1.4817702613661323, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 
2.588862419128418, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6918180584907532, + "num_tokens": 349150018.0, + "step": 13493 + }, + { + "epoch": 1.4818800790687459, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2236979007720947, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7098582983016968, + "num_tokens": 349178029.0, + "step": 13494 + }, + { + "epoch": 1.4819898967713596, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.561833620071411, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.697523832321167, + "num_tokens": 349201132.0, + "step": 13495 + }, + { + "epoch": 1.4820997144739732, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5888497829437256, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.722855806350708, + "num_tokens": 349222121.0, + "step": 13496 + }, + { + "epoch": 1.482209532176587, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2208871841430664, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7308861613273621, + "num_tokens": 349250114.0, + "step": 13497 + }, + { + "epoch": 1.4823193498792007, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1847894191741943, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6981189250946045, + "num_tokens": 349278670.0, + "step": 13498 + }, + { + "epoch": 1.4824291675818142, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4050042629241943, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7014271020889282, + "num_tokens": 349304750.0, + "step": 13499 + }, + { + "epoch": 1.4825389852844277, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.376849412918091, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7261078953742981, + "num_tokens": 349332424.0, + "step": 13500 + }, + { + "epoch": 1.4826488029870415, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3120124340057373, + 
"learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6960489749908447, + "num_tokens": 349359251.0, + "step": 13501 + }, + { + "epoch": 1.4827586206896552, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.197873830795288, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6944129467010498, + "num_tokens": 349390802.0, + "step": 13502 + }, + { + "epoch": 1.4828684383922688, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6099963188171387, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7292343974113464, + "num_tokens": 349411462.0, + "step": 13503 + }, + { + "epoch": 1.4829782560948825, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2474586963653564, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7440719604492188, + "num_tokens": 349437804.0, + "step": 13504 + }, + { + "epoch": 1.483088073797496, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.560063600540161, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7196667790412903, + "num_tokens": 349461433.0, + "step": 13505 + }, + { + "epoch": 1.4831978915001098, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.229891777038574, + "learning_rate": 1e-06, + "loss": 1.0904, + "mean_token_accuracy": 0.6854377388954163, + "num_tokens": 349492785.0, + "step": 13506 + }, + { + "epoch": 1.4833077092027236, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.8447425365448, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7146421670913696, + "num_tokens": 349514255.0, + "step": 13507 + }, + { + "epoch": 1.4834175269053371, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4717602729797363, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7129553556442261, + "num_tokens": 349540250.0, + "step": 13508 + }, + { + "epoch": 1.4835273446079509, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 3.084662675857544, + "learning_rate": 1e-06, + 
"loss": 0.8869, + "mean_token_accuracy": 0.7261601686477661, + "num_tokens": 349555718.0, + "step": 13509 + }, + { + "epoch": 1.4836371623105644, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4921507835388184, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7153156995773315, + "num_tokens": 349581064.0, + "step": 13510 + }, + { + "epoch": 1.4837469800131782, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.311105489730835, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6947756409645081, + "num_tokens": 349610750.0, + "step": 13511 + }, + { + "epoch": 1.4838567977157917, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6420679092407227, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7120347619056702, + "num_tokens": 349631750.0, + "step": 13512 + }, + { + "epoch": 1.4839666154184055, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3635966777801514, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6961220502853394, + "num_tokens": 349660878.0, + "step": 13513 + }, + { + "epoch": 1.484076433121019, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4773502349853516, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7248781323432922, + "num_tokens": 349684386.0, + "step": 13514 + }, + { + "epoch": 1.4841862508236328, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3456740379333496, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6840428113937378, + "num_tokens": 349713907.0, + "step": 13515 + }, + { + "epoch": 1.4842960685262465, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2696173191070557, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7073773741722107, + "num_tokens": 349744778.0, + "step": 13516 + }, + { + "epoch": 1.48440588622886, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5087051391601562, + "learning_rate": 1e-06, + "loss": 0.8729, + 
"mean_token_accuracy": 0.7448315620422363, + "num_tokens": 349767661.0, + "step": 13517 + }, + { + "epoch": 1.4845157039314738, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.384269952774048, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6977640986442566, + "num_tokens": 349794478.0, + "step": 13518 + }, + { + "epoch": 1.4846255216340873, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5291550159454346, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.7020566463470459, + "num_tokens": 349819844.0, + "step": 13519 + }, + { + "epoch": 1.484735339336701, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.390929937362671, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7152793407440186, + "num_tokens": 349847165.0, + "step": 13520 + }, + { + "epoch": 1.4848451570393149, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3266258239746094, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6911946535110474, + "num_tokens": 349874515.0, + "step": 13521 + }, + { + "epoch": 1.4849549747419284, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.428616762161255, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7136965990066528, + "num_tokens": 349899695.0, + "step": 13522 + }, + { + "epoch": 1.485064792444542, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1777749061584473, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.699286699295044, + "num_tokens": 349930611.0, + "step": 13523 + }, + { + "epoch": 1.4851746101471557, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2974750995635986, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6972377300262451, + "num_tokens": 349958000.0, + "step": 13524 + }, + { + "epoch": 1.4852844278497694, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3273024559020996, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 
0.7059879302978516, + "num_tokens": 349984827.0, + "step": 13525 + }, + { + "epoch": 1.485394245552383, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.498202085494995, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7109956741333008, + "num_tokens": 350008778.0, + "step": 13526 + }, + { + "epoch": 1.4855040632549967, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3157851696014404, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7096359729766846, + "num_tokens": 350034827.0, + "step": 13527 + }, + { + "epoch": 1.4856138809576103, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.2437098026275635, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.722809910774231, + "num_tokens": 350062547.0, + "step": 13528 + }, + { + "epoch": 1.485723698660224, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.4245445728302, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7125537395477295, + "num_tokens": 350089143.0, + "step": 13529 + }, + { + "epoch": 1.4858335163628378, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.401439905166626, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6993765234947205, + "num_tokens": 350113252.0, + "step": 13530 + }, + { + "epoch": 1.4859433340654513, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0868594646453857, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7068030834197998, + "num_tokens": 350145085.0, + "step": 13531 + }, + { + "epoch": 1.486053151768065, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.553865909576416, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7200392484664917, + "num_tokens": 350167183.0, + "step": 13532 + }, + { + "epoch": 1.4861629694706786, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.450380563735962, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7191587090492249, + "num_tokens": 
350189984.0, + "step": 13533 + }, + { + "epoch": 1.4862727871732924, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3676652908325195, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.706783652305603, + "num_tokens": 350216411.0, + "step": 13534 + }, + { + "epoch": 1.4863826048759061, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.118703603744507, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7151802182197571, + "num_tokens": 350247603.0, + "step": 13535 + }, + { + "epoch": 1.4864924225785197, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.434004545211792, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.723162829875946, + "num_tokens": 350273919.0, + "step": 13536 + }, + { + "epoch": 1.4866022402811332, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.573742389678955, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7086799144744873, + "num_tokens": 350300025.0, + "step": 13537 + }, + { + "epoch": 1.486712057983747, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6535608768463135, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7281748056411743, + "num_tokens": 350323919.0, + "step": 13538 + }, + { + "epoch": 1.4868218756863607, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.565251350402832, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7118582725524902, + "num_tokens": 350349168.0, + "step": 13539 + }, + { + "epoch": 1.4869316933889742, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.691999673843384, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7180213332176208, + "num_tokens": 350370466.0, + "step": 13540 + }, + { + "epoch": 1.487041511091588, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5778963565826416, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7214470505714417, + "num_tokens": 350392292.0, + "step": 13541 + }, 
+ { + "epoch": 1.4871513287942015, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.406266212463379, + "learning_rate": 1e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6852334141731262, + "num_tokens": 350418165.0, + "step": 13542 + }, + { + "epoch": 1.4872611464968153, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7070934772491455, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7105113863945007, + "num_tokens": 350438677.0, + "step": 13543 + }, + { + "epoch": 1.487370964199429, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.5087544918060303, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7083799839019775, + "num_tokens": 350463147.0, + "step": 13544 + }, + { + "epoch": 1.4874807819020426, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1567957401275635, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6947295665740967, + "num_tokens": 350493784.0, + "step": 13545 + }, + { + "epoch": 1.4875905996046563, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.6166625022888184, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7057970762252808, + "num_tokens": 350515693.0, + "step": 13546 + }, + { + "epoch": 1.4877004173072699, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.1094837188720703, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7125281095504761, + "num_tokens": 350549384.0, + "step": 13547 + }, + { + "epoch": 1.4878102350098836, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.223421573638916, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6898429989814758, + "num_tokens": 350580496.0, + "step": 13548 + }, + { + "epoch": 1.4879200527124974, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.673128128051758, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7094653844833374, + "num_tokens": 350602962.0, + "step": 13549 + }, + { + "epoch": 
1.488029870415111, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.396602153778076, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7081143856048584, + "num_tokens": 350628895.0, + "step": 13550 + }, + { + "epoch": 1.4881396881177245, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.762665033340454, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7434372901916504, + "num_tokens": 350646645.0, + "step": 13551 + }, + { + "epoch": 1.4882495058203382, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.516160726547241, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6995501518249512, + "num_tokens": 350671701.0, + "step": 13552 + }, + { + "epoch": 1.488359323522952, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.0450022220611572, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.7033528089523315, + "num_tokens": 350703873.0, + "step": 13553 + }, + { + "epoch": 1.4884691412255655, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.215449810028076, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.7008711099624634, + "num_tokens": 350733475.0, + "step": 13554 + }, + { + "epoch": 1.4885789589281793, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.34088397026062, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7376797199249268, + "num_tokens": 350758833.0, + "step": 13555 + }, + { + "epoch": 1.4886887766307928, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.642274856567383, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7107585072517395, + "num_tokens": 350780400.0, + "step": 13556 + }, + { + "epoch": 1.4887985943334066, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.217763662338257, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7271663546562195, + "num_tokens": 350808141.0, + "step": 13557 + }, + { + "epoch": 1.4889084120360203, + "ewc_loss": 
1.8715858459472656e-05, + "grad_norm": 2.4375290870666504, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7121317386627197, + "num_tokens": 350835002.0, + "step": 13558 + }, + { + "epoch": 1.4890182297386338, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5807249546051025, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7503870725631714, + "num_tokens": 350856990.0, + "step": 13559 + }, + { + "epoch": 1.4891280474412476, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.253889322280884, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6987873911857605, + "num_tokens": 350886291.0, + "step": 13560 + }, + { + "epoch": 1.4892378651438611, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.180428981781006, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7113140821456909, + "num_tokens": 350914005.0, + "step": 13561 + }, + { + "epoch": 1.489347682846475, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.3770861625671387, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6824948787689209, + "num_tokens": 350941507.0, + "step": 13562 + }, + { + "epoch": 1.4894575005490884, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7842442989349365, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7264127731323242, + "num_tokens": 350960208.0, + "step": 13563 + }, + { + "epoch": 1.4895673182517022, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.275759696960449, + "learning_rate": 1e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.683158278465271, + "num_tokens": 350988155.0, + "step": 13564 + }, + { + "epoch": 1.4896771359543157, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.397404432296753, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7180882692337036, + "num_tokens": 351013036.0, + "step": 13565 + }, + { + "epoch": 1.4897869536569295, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.4575393199920654, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.718521237373352, + "num_tokens": 351036133.0, + "step": 13566 + }, + { + "epoch": 1.4898967713595432, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.145064353942871, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.714685320854187, + "num_tokens": 351069401.0, + "step": 13567 + }, + { + "epoch": 1.4900065890621568, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1772780418395996, + "learning_rate": 1e-06, + "loss": 1.0971, + "mean_token_accuracy": 0.6886172294616699, + "num_tokens": 351102336.0, + "step": 13568 + }, + { + "epoch": 1.4901164067647705, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7874233722686768, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7233924865722656, + "num_tokens": 351120748.0, + "step": 13569 + }, + { + "epoch": 1.490226224467384, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.694183111190796, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7071333527565002, + "num_tokens": 351140821.0, + "step": 13570 + }, + { + "epoch": 1.4903360421699978, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1460883617401123, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6890003681182861, + "num_tokens": 351170688.0, + "step": 13571 + }, + { + "epoch": 1.4904458598726116, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.366421699523926, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.7031753659248352, + "num_tokens": 351195904.0, + "step": 13572 + }, + { + "epoch": 1.4905556775752251, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6136183738708496, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7117800712585449, + "num_tokens": 351218093.0, + "step": 13573 + }, + { + "epoch": 1.4906654952778389, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4109811782836914, + 
"learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7007215619087219, + "num_tokens": 351243442.0, + "step": 13574 + }, + { + "epoch": 1.4907753129804524, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7592766284942627, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6893792152404785, + "num_tokens": 351264376.0, + "step": 13575 + }, + { + "epoch": 1.4908851306830662, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.818509340286255, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7433839440345764, + "num_tokens": 351282166.0, + "step": 13576 + }, + { + "epoch": 1.4909949483856797, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2985615730285645, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6947667598724365, + "num_tokens": 351310966.0, + "step": 13577 + }, + { + "epoch": 1.4911047660882935, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.303098201751709, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7208079099655151, + "num_tokens": 351338849.0, + "step": 13578 + }, + { + "epoch": 1.491214583790907, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.472360849380493, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7117892503738403, + "num_tokens": 351364478.0, + "step": 13579 + }, + { + "epoch": 1.4913244014935207, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4433088302612305, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7088462114334106, + "num_tokens": 351389319.0, + "step": 13580 + }, + { + "epoch": 1.4914342191961345, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.392364740371704, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.707459568977356, + "num_tokens": 351413848.0, + "step": 13581 + }, + { + "epoch": 1.491544036898748, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.125955104827881, + "learning_rate": 1e-06, + "loss": 
1.0815, + "mean_token_accuracy": 0.6925890445709229, + "num_tokens": 351443166.0, + "step": 13582 + }, + { + "epoch": 1.4916538546013618, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.817986011505127, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7125953435897827, + "num_tokens": 351462274.0, + "step": 13583 + }, + { + "epoch": 1.4917636723039753, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4708313941955566, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7174879312515259, + "num_tokens": 351487010.0, + "step": 13584 + }, + { + "epoch": 1.491873490006589, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.310070753097534, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7347503304481506, + "num_tokens": 351514148.0, + "step": 13585 + }, + { + "epoch": 1.4919833077092028, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.142677068710327, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6845239400863647, + "num_tokens": 351548859.0, + "step": 13586 + }, + { + "epoch": 1.4920931254118164, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6391921043395996, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7391337156295776, + "num_tokens": 351568450.0, + "step": 13587 + }, + { + "epoch": 1.49220294311443, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.586808919906616, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7132304310798645, + "num_tokens": 351590179.0, + "step": 13588 + }, + { + "epoch": 1.4923127608170437, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1620125770568848, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7262650728225708, + "num_tokens": 351617740.0, + "step": 13589 + }, + { + "epoch": 1.4924225785196574, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.376357078552246, + "learning_rate": 1e-06, + "loss": 1.1203, + "mean_token_accuracy": 
0.6833189725875854, + "num_tokens": 351649879.0, + "step": 13590 + }, + { + "epoch": 1.492532396222271, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7102198600769043, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7070746421813965, + "num_tokens": 351671227.0, + "step": 13591 + }, + { + "epoch": 1.4926422139248847, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.548830986022949, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7088954448699951, + "num_tokens": 351693296.0, + "step": 13592 + }, + { + "epoch": 1.4927520316274983, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6978652477264404, + "learning_rate": 1e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7687433958053589, + "num_tokens": 351711924.0, + "step": 13593 + }, + { + "epoch": 1.492861849330112, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.390308141708374, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7250561714172363, + "num_tokens": 351737262.0, + "step": 13594 + }, + { + "epoch": 1.4929716670327258, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4438695907592773, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.722690224647522, + "num_tokens": 351760464.0, + "step": 13595 + }, + { + "epoch": 1.4930814847353393, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4344780445098877, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6901093125343323, + "num_tokens": 351784671.0, + "step": 13596 + }, + { + "epoch": 1.493191302437953, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3503317832946777, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6980438232421875, + "num_tokens": 351813130.0, + "step": 13597 + }, + { + "epoch": 1.4933011201405666, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.628746271133423, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.702782392501831, + "num_tokens": 
351836354.0, + "step": 13598 + }, + { + "epoch": 1.4934109378431804, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2800662517547607, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7233980298042297, + "num_tokens": 351864756.0, + "step": 13599 + }, + { + "epoch": 1.493520755545794, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0390751361846924, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6930798292160034, + "num_tokens": 351900574.0, + "step": 13600 + }, + { + "epoch": 1.4936305732484076, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5353052616119385, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.723762571811676, + "num_tokens": 351922710.0, + "step": 13601 + }, + { + "epoch": 1.4937403909510212, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.365938663482666, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7138184905052185, + "num_tokens": 351949225.0, + "step": 13602 + }, + { + "epoch": 1.493850208653635, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.370587110519409, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7043130993843079, + "num_tokens": 351974234.0, + "step": 13603 + }, + { + "epoch": 1.4939600263562487, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.49306058883667, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7046266794204712, + "num_tokens": 351996602.0, + "step": 13604 + }, + { + "epoch": 1.4940698440588622, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.430360794067383, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7030454874038696, + "num_tokens": 352020711.0, + "step": 13605 + }, + { + "epoch": 1.494179661761476, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.279233694076538, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6847511529922485, + "num_tokens": 352052927.0, + "step": 13606 + }, + { + 
"epoch": 1.4942894794640895, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.45843243598938, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7087133526802063, + "num_tokens": 352077195.0, + "step": 13607 + }, + { + "epoch": 1.4943992971667033, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1758317947387695, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6955041885375977, + "num_tokens": 352106228.0, + "step": 13608 + }, + { + "epoch": 1.494509114869317, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.196598529815674, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7067786455154419, + "num_tokens": 352133965.0, + "step": 13609 + }, + { + "epoch": 1.4946189325719306, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.510505437850952, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6999581456184387, + "num_tokens": 352158760.0, + "step": 13610 + }, + { + "epoch": 1.4947287502745443, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.0243427753448486, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7207167148590088, + "num_tokens": 352177506.0, + "step": 13611 + }, + { + "epoch": 1.4948385679771579, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.7325966358184814, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.714690625667572, + "num_tokens": 352198833.0, + "step": 13612 + }, + { + "epoch": 1.4949483856797716, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.474834442138672, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7303483486175537, + "num_tokens": 352220545.0, + "step": 13613 + }, + { + "epoch": 1.4950582033823854, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3263728618621826, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.7009683847427368, + "num_tokens": 352251267.0, + "step": 13614 + }, + { + "epoch": 1.495168021084999, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.547649621963501, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6941548585891724, + "num_tokens": 352277812.0, + "step": 13615 + }, + { + "epoch": 1.4952778387876124, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4967169761657715, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.7022919654846191, + "num_tokens": 352302730.0, + "step": 13616 + }, + { + "epoch": 1.4953876564902262, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.691519021987915, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6997253894805908, + "num_tokens": 352323936.0, + "step": 13617 + }, + { + "epoch": 1.49549747419284, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.256587505340576, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7020618915557861, + "num_tokens": 352353122.0, + "step": 13618 + }, + { + "epoch": 1.4956072918954535, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5024402141571045, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7062897086143494, + "num_tokens": 352379397.0, + "step": 13619 + }, + { + "epoch": 1.4957171095980673, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3684449195861816, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7192479372024536, + "num_tokens": 352405394.0, + "step": 13620 + }, + { + "epoch": 1.4958269273006808, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5319204330444336, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7263559103012085, + "num_tokens": 352427832.0, + "step": 13621 + }, + { + "epoch": 1.4959367450032945, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4213814735412598, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7224442362785339, + "num_tokens": 352451288.0, + "step": 13622 + }, + { + "epoch": 1.4960465627059083, + "ewc_loss": 1.8835067749023438e-05, 
+ "grad_norm": 2.1663155555725098, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.712604820728302, + "num_tokens": 352480422.0, + "step": 13623 + }, + { + "epoch": 1.4961563804085218, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6419007778167725, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7081682682037354, + "num_tokens": 352502417.0, + "step": 13624 + }, + { + "epoch": 1.4962661981111356, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.29372239112854, + "learning_rate": 1e-06, + "loss": 1.0803, + "mean_token_accuracy": 0.6808091402053833, + "num_tokens": 352534885.0, + "step": 13625 + }, + { + "epoch": 1.4963760158137491, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1696486473083496, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7201299071311951, + "num_tokens": 352564471.0, + "step": 13626 + }, + { + "epoch": 1.4964858335163629, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6015663146972656, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.6870697140693665, + "num_tokens": 352586736.0, + "step": 13627 + }, + { + "epoch": 1.4965956512189764, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3066632747650146, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.714074969291687, + "num_tokens": 352615469.0, + "step": 13628 + }, + { + "epoch": 1.4967054689215902, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2692384719848633, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7118687033653259, + "num_tokens": 352643181.0, + "step": 13629 + }, + { + "epoch": 1.4968152866242037, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.307802438735962, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.679344892501831, + "num_tokens": 352673504.0, + "step": 13630 + }, + { + "epoch": 1.4969251043268175, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 
2.3213746547698975, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.718104898929596, + "num_tokens": 352701778.0, + "step": 13631 + }, + { + "epoch": 1.4970349220294312, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3859376907348633, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7079494595527649, + "num_tokens": 352728066.0, + "step": 13632 + }, + { + "epoch": 1.4971447397320448, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.412381172180176, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7020047307014465, + "num_tokens": 352752048.0, + "step": 13633 + }, + { + "epoch": 1.4972545574346585, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1534101963043213, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6896109580993652, + "num_tokens": 352786096.0, + "step": 13634 + }, + { + "epoch": 1.497364375137272, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3241047859191895, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.727959156036377, + "num_tokens": 352812176.0, + "step": 13635 + }, + { + "epoch": 1.4974741928398858, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4637582302093506, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.713248610496521, + "num_tokens": 352835227.0, + "step": 13636 + }, + { + "epoch": 1.4975840105424996, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4212496280670166, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7117841243743896, + "num_tokens": 352861196.0, + "step": 13637 + }, + { + "epoch": 1.497693828245113, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.542721748352051, + "learning_rate": 1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7561710476875305, + "num_tokens": 352882435.0, + "step": 13638 + }, + { + "epoch": 1.4978036459477266, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5079257488250732, + 
"learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.687521755695343, + "num_tokens": 352907753.0, + "step": 13639 + }, + { + "epoch": 1.4979134636503404, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6409363746643066, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.738826334476471, + "num_tokens": 352929689.0, + "step": 13640 + }, + { + "epoch": 1.4980232813529542, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5366342067718506, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7164466381072998, + "num_tokens": 352950689.0, + "step": 13641 + }, + { + "epoch": 1.4981330990555677, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4427194595336914, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7134495377540588, + "num_tokens": 352978013.0, + "step": 13642 + }, + { + "epoch": 1.4982429167581814, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.279273748397827, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7098050713539124, + "num_tokens": 353007336.0, + "step": 13643 + }, + { + "epoch": 1.498352734460795, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.376110553741455, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7186480760574341, + "num_tokens": 353032226.0, + "step": 13644 + }, + { + "epoch": 1.4984625521634087, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5769057273864746, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7271108031272888, + "num_tokens": 353054202.0, + "step": 13645 + }, + { + "epoch": 1.4985723698660225, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.533275604248047, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7158375978469849, + "num_tokens": 353077624.0, + "step": 13646 + }, + { + "epoch": 1.498682187568636, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3237502574920654, + "learning_rate": 1e-06, + 
"loss": 0.9729, + "mean_token_accuracy": 0.7142903804779053, + "num_tokens": 353105436.0, + "step": 13647 + }, + { + "epoch": 1.4987920052712498, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3011701107025146, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6950600743293762, + "num_tokens": 353133795.0, + "step": 13648 + }, + { + "epoch": 1.4989018229738633, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6492578983306885, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7307031750679016, + "num_tokens": 353153980.0, + "step": 13649 + }, + { + "epoch": 1.499011640676477, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3461055755615234, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7159944772720337, + "num_tokens": 353180072.0, + "step": 13650 + }, + { + "epoch": 1.4991214583790908, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.338078737258911, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6961251497268677, + "num_tokens": 353209854.0, + "step": 13651 + }, + { + "epoch": 1.4992312760817044, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.883455276489258, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7199268937110901, + "num_tokens": 353227772.0, + "step": 13652 + }, + { + "epoch": 1.499341093784318, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.619410514831543, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7413538098335266, + "num_tokens": 353248130.0, + "step": 13653 + }, + { + "epoch": 1.4994509114869317, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.9900951385498047, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7076431512832642, + "num_tokens": 353268005.0, + "step": 13654 + }, + { + "epoch": 1.4995607291895454, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3430984020233154, + "learning_rate": 1e-06, + "loss": 0.9053, + 
"mean_token_accuracy": 0.7280453443527222, + "num_tokens": 353293531.0, + "step": 13655 + }, + { + "epoch": 1.499670546892159, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.292154550552368, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6907899975776672, + "num_tokens": 353323734.0, + "step": 13656 + }, + { + "epoch": 1.4997803645947727, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6348793506622314, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6992623209953308, + "num_tokens": 353345463.0, + "step": 13657 + }, + { + "epoch": 1.4998901822973862, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.637242078781128, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.707109808921814, + "num_tokens": 353369808.0, + "step": 13658 + }, + { + "epoch": 1.5, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4668595790863037, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7372047901153564, + "num_tokens": 353393328.0, + "step": 13659 + }, + { + "epoch": 1.5001098177026138, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5622940063476562, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7110573053359985, + "num_tokens": 353417908.0, + "step": 13660 + }, + { + "epoch": 1.5002196354052273, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.407978057861328, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7088170647621155, + "num_tokens": 353443856.0, + "step": 13661 + }, + { + "epoch": 1.5003294531078408, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.281647205352783, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7148261070251465, + "num_tokens": 353469890.0, + "step": 13662 + }, + { + "epoch": 1.5004392708104546, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3579959869384766, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6942827105522156, + 
"num_tokens": 353496955.0, + "step": 13663 + }, + { + "epoch": 1.5005490885130683, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1549713611602783, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7007601261138916, + "num_tokens": 353527810.0, + "step": 13664 + }, + { + "epoch": 1.500658906215682, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3608264923095703, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6972614526748657, + "num_tokens": 353554731.0, + "step": 13665 + }, + { + "epoch": 1.5007687239182956, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.109865427017212, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6987910270690918, + "num_tokens": 353586332.0, + "step": 13666 + }, + { + "epoch": 1.5008785416209092, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2995903491973877, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6942498683929443, + "num_tokens": 353612790.0, + "step": 13667 + }, + { + "epoch": 1.500988359323523, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.219592571258545, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7085386514663696, + "num_tokens": 353640385.0, + "step": 13668 + }, + { + "epoch": 1.5010981770261367, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.419475555419922, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6931679248809814, + "num_tokens": 353665468.0, + "step": 13669 + }, + { + "epoch": 1.5012079947287504, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3344192504882812, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7258397340774536, + "num_tokens": 353690606.0, + "step": 13670 + }, + { + "epoch": 1.501317812431364, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4843311309814453, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.718498945236206, + "num_tokens": 353713806.0, + 
"step": 13671 + }, + { + "epoch": 1.5014276301339775, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5330605506896973, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7173852920532227, + "num_tokens": 353738924.0, + "step": 13672 + }, + { + "epoch": 1.5015374478365913, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3886890411376953, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.703476071357727, + "num_tokens": 353763454.0, + "step": 13673 + }, + { + "epoch": 1.501647265539205, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3744189739227295, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7307460308074951, + "num_tokens": 353787635.0, + "step": 13674 + }, + { + "epoch": 1.5017570832418186, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.288651704788208, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7069361209869385, + "num_tokens": 353815796.0, + "step": 13675 + }, + { + "epoch": 1.501866900944432, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3458733558654785, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7351074814796448, + "num_tokens": 353840383.0, + "step": 13676 + }, + { + "epoch": 1.5019767186470458, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.309321165084839, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7106443643569946, + "num_tokens": 353866233.0, + "step": 13677 + }, + { + "epoch": 1.5020865363496596, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.241995334625244, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7060506343841553, + "num_tokens": 353894026.0, + "step": 13678 + }, + { + "epoch": 1.5021963540522734, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.20308780670166, + "learning_rate": 1e-06, + "loss": 1.089, + "mean_token_accuracy": 0.6847426891326904, + "num_tokens": 353923991.0, + "step": 13679 + }, + { + "epoch": 
1.502306171754887, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2344698905944824, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7219610810279846, + "num_tokens": 353954798.0, + "step": 13680 + }, + { + "epoch": 1.5024159894575004, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.863396644592285, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7172989249229431, + "num_tokens": 353975069.0, + "step": 13681 + }, + { + "epoch": 1.5025258071601142, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.771604537963867, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7331302165985107, + "num_tokens": 353994106.0, + "step": 13682 + }, + { + "epoch": 1.502635624862728, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 7.1872429847717285, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6909288167953491, + "num_tokens": 354018331.0, + "step": 13683 + }, + { + "epoch": 1.5027454425653415, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5532546043395996, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7287088632583618, + "num_tokens": 354040243.0, + "step": 13684 + }, + { + "epoch": 1.5028552602679552, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5048654079437256, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.718854546546936, + "num_tokens": 354064205.0, + "step": 13685 + }, + { + "epoch": 1.5029650779705688, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5536937713623047, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7045876979827881, + "num_tokens": 354089638.0, + "step": 13686 + }, + { + "epoch": 1.5030748956731825, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.609262228012085, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7234423160552979, + "num_tokens": 354113131.0, + "step": 13687 + }, + { + "epoch": 1.5031847133757963, + "ewc_loss": 
1.895427703857422e-05, + "grad_norm": 2.4111928939819336, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7230549454689026, + "num_tokens": 354137117.0, + "step": 13688 + }, + { + "epoch": 1.5032945310784098, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5576231479644775, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.727165937423706, + "num_tokens": 354161436.0, + "step": 13689 + }, + { + "epoch": 1.5034043487810234, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5598251819610596, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7045108079910278, + "num_tokens": 354186678.0, + "step": 13690 + }, + { + "epoch": 1.5035141664836371, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2298364639282227, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7269837856292725, + "num_tokens": 354214855.0, + "step": 13691 + }, + { + "epoch": 1.5036239841862509, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2229509353637695, + "learning_rate": 1e-06, + "loss": 1.093, + "mean_token_accuracy": 0.6721711754798889, + "num_tokens": 354247092.0, + "step": 13692 + }, + { + "epoch": 1.5037338018888646, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.620194435119629, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7281233072280884, + "num_tokens": 354268746.0, + "step": 13693 + }, + { + "epoch": 1.5038436195914782, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6502535343170166, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7148810029029846, + "num_tokens": 354296795.0, + "step": 13694 + }, + { + "epoch": 1.5039534372940917, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.240319013595581, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7185990214347839, + "num_tokens": 354325884.0, + "step": 13695 + }, + { + "epoch": 1.5040632549967055, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.453228712081909, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7205765843391418, + "num_tokens": 354348878.0, + "step": 13696 + }, + { + "epoch": 1.5041730726993192, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.491185426712036, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.736077070236206, + "num_tokens": 354373386.0, + "step": 13697 + }, + { + "epoch": 1.5042828904019327, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.350182294845581, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7123296856880188, + "num_tokens": 354399582.0, + "step": 13698 + }, + { + "epoch": 1.5043927081045465, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3503286838531494, + "learning_rate": 1e-06, + "loss": 1.0884, + "mean_token_accuracy": 0.6761384010314941, + "num_tokens": 354427759.0, + "step": 13699 + }, + { + "epoch": 1.50450252580716, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.541536808013916, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6926456689834595, + "num_tokens": 354450286.0, + "step": 13700 + }, + { + "epoch": 1.5046123435097738, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.403782844543457, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6972377300262451, + "num_tokens": 354476107.0, + "step": 13701 + }, + { + "epoch": 1.5047221612123876, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2028231620788574, + "learning_rate": 1e-06, + "loss": 1.0852, + "mean_token_accuracy": 0.6873272657394409, + "num_tokens": 354505040.0, + "step": 13702 + }, + { + "epoch": 1.504831978915001, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2529115676879883, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7052391171455383, + "num_tokens": 354534336.0, + "step": 13703 + }, + { + "epoch": 1.5049417966176146, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.513939619064331, + 
"learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7092408537864685, + "num_tokens": 354559588.0, + "step": 13704 + }, + { + "epoch": 1.5050516143202284, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.54134464263916, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7052191495895386, + "num_tokens": 354583966.0, + "step": 13705 + }, + { + "epoch": 1.5051614320228421, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.469339609146118, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7150369882583618, + "num_tokens": 354608592.0, + "step": 13706 + }, + { + "epoch": 1.505271249725456, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5584864616394043, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7234609127044678, + "num_tokens": 354631381.0, + "step": 13707 + }, + { + "epoch": 1.5053810674280694, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.552849054336548, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.711331844329834, + "num_tokens": 354655687.0, + "step": 13708 + }, + { + "epoch": 1.505490885130683, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4833760261535645, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7010583877563477, + "num_tokens": 354680390.0, + "step": 13709 + }, + { + "epoch": 1.5056007028332967, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.448258399963379, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6895904541015625, + "num_tokens": 354706633.0, + "step": 13710 + }, + { + "epoch": 1.5057105205359105, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5458788871765137, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7131069898605347, + "num_tokens": 354730625.0, + "step": 13711 + }, + { + "epoch": 1.505820338238524, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.96359920501709, + "learning_rate": 1e-06, + "loss": 0.8926, 
+ "mean_token_accuracy": 0.7320170402526855, + "num_tokens": 354747197.0, + "step": 13712 + }, + { + "epoch": 1.5059301559411375, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.432723045349121, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7167392373085022, + "num_tokens": 354771713.0, + "step": 13713 + }, + { + "epoch": 1.5060399736437513, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.337074041366577, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7157340049743652, + "num_tokens": 354799653.0, + "step": 13714 + }, + { + "epoch": 1.506149791346365, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4465537071228027, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6985570788383484, + "num_tokens": 354827490.0, + "step": 13715 + }, + { + "epoch": 1.5062596090489788, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.627556085586548, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6998010873794556, + "num_tokens": 354854128.0, + "step": 13716 + }, + { + "epoch": 1.5063694267515924, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.47878098487854, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7058361768722534, + "num_tokens": 354879134.0, + "step": 13717 + }, + { + "epoch": 1.506479244454206, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.657545328140259, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.7052291631698608, + "num_tokens": 354902971.0, + "step": 13718 + }, + { + "epoch": 1.5065890621568196, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4682559967041016, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7189886569976807, + "num_tokens": 354926622.0, + "step": 13719 + }, + { + "epoch": 1.5066988798594334, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4165570735931396, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 
0.7133337259292603, + "num_tokens": 354950837.0, + "step": 13720 + }, + { + "epoch": 1.5068086975620472, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.338148593902588, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6901962757110596, + "num_tokens": 354978412.0, + "step": 13721 + }, + { + "epoch": 1.5069185152646607, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.511151075363159, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.715726912021637, + "num_tokens": 355003622.0, + "step": 13722 + }, + { + "epoch": 1.5070283329672742, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6459085941314697, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7101283073425293, + "num_tokens": 355023549.0, + "step": 13723 + }, + { + "epoch": 1.507138150669888, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.474018096923828, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6992658376693726, + "num_tokens": 355048551.0, + "step": 13724 + }, + { + "epoch": 1.5072479683725017, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4411182403564453, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7101297974586487, + "num_tokens": 355075336.0, + "step": 13725 + }, + { + "epoch": 1.5073577860751153, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2385880947113037, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7042816877365112, + "num_tokens": 355105548.0, + "step": 13726 + }, + { + "epoch": 1.5074676037777288, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2151038646698, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7204710245132446, + "num_tokens": 355133482.0, + "step": 13727 + }, + { + "epoch": 1.5075774214803426, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3955349922180176, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7040783166885376, + "num_tokens": 
355158234.0, + "step": 13728 + }, + { + "epoch": 1.5076872391829563, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3314578533172607, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7075032591819763, + "num_tokens": 355184906.0, + "step": 13729 + }, + { + "epoch": 1.50779705688557, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.633840799331665, + "learning_rate": 1e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6892216205596924, + "num_tokens": 355206627.0, + "step": 13730 + }, + { + "epoch": 1.5079068745881836, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1503024101257324, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7029426097869873, + "num_tokens": 355236100.0, + "step": 13731 + }, + { + "epoch": 1.5080166922907972, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.0872645378112793, + "learning_rate": 1e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.6837055683135986, + "num_tokens": 355270428.0, + "step": 13732 + }, + { + "epoch": 1.508126509993411, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.088510036468506, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7046595811843872, + "num_tokens": 355303500.0, + "step": 13733 + }, + { + "epoch": 1.5082363276960247, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.225151538848877, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.7011781930923462, + "num_tokens": 355333276.0, + "step": 13734 + }, + { + "epoch": 1.5083461453986382, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3541183471679688, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7259358167648315, + "num_tokens": 355356832.0, + "step": 13735 + }, + { + "epoch": 1.508455963101252, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.559455156326294, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7152910828590393, + "num_tokens": 355379912.0, + "step": 13736 + 
}, + { + "epoch": 1.5085657808038655, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3248748779296875, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6925420761108398, + "num_tokens": 355408113.0, + "step": 13737 + }, + { + "epoch": 1.5086755985064793, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.543168067932129, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7224856615066528, + "num_tokens": 355432286.0, + "step": 13738 + }, + { + "epoch": 1.508785416209093, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4888439178466797, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6923877000808716, + "num_tokens": 355457847.0, + "step": 13739 + }, + { + "epoch": 1.5088952339117065, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4063594341278076, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7194403409957886, + "num_tokens": 355483148.0, + "step": 13740 + }, + { + "epoch": 1.50900505161432, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.29154634475708, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7206664085388184, + "num_tokens": 355510583.0, + "step": 13741 + }, + { + "epoch": 1.5091148693169338, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.427002429962158, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7355600595474243, + "num_tokens": 355535520.0, + "step": 13742 + }, + { + "epoch": 1.5092246870195476, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.23660945892334, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7130049467086792, + "num_tokens": 355562645.0, + "step": 13743 + }, + { + "epoch": 1.5093345047221614, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2818500995635986, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6995055675506592, + "num_tokens": 355590352.0, + "step": 13744 + }, + { + "epoch": 
1.5094443224247749, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.49271821975708, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6949890851974487, + "num_tokens": 355616447.0, + "step": 13745 + }, + { + "epoch": 1.5095541401273884, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.295497417449951, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6989092230796814, + "num_tokens": 355643921.0, + "step": 13746 + }, + { + "epoch": 1.5096639578300022, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.679675340652466, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.726108193397522, + "num_tokens": 355663688.0, + "step": 13747 + }, + { + "epoch": 1.509773775532616, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.7188432216644287, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7076461911201477, + "num_tokens": 355684287.0, + "step": 13748 + }, + { + "epoch": 1.5098835932352295, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.146343946456909, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.700035035610199, + "num_tokens": 355714256.0, + "step": 13749 + }, + { + "epoch": 1.5099934109378432, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.222446918487549, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7059592008590698, + "num_tokens": 355741462.0, + "step": 13750 + }, + { + "epoch": 1.5101032286404568, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5524322986602783, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7369445562362671, + "num_tokens": 355762296.0, + "step": 13751 + }, + { + "epoch": 1.5102130463430705, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2906320095062256, + "learning_rate": 1e-06, + "loss": 1.0672, + "mean_token_accuracy": 0.6900993585586548, + "num_tokens": 355792887.0, + "step": 13752 + }, + { + "epoch": 1.5103228640456843, + 
"ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2869582176208496, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7039318084716797, + "num_tokens": 355821379.0, + "step": 13753 + }, + { + "epoch": 1.5104326817482978, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5696511268615723, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7182560563087463, + "num_tokens": 355843447.0, + "step": 13754 + }, + { + "epoch": 1.5105424994509113, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.424504041671753, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7044109106063843, + "num_tokens": 355867998.0, + "step": 13755 + }, + { + "epoch": 1.510652317153525, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6271169185638428, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.727367103099823, + "num_tokens": 355888455.0, + "step": 13756 + }, + { + "epoch": 1.5107621348561389, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.390840768814087, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.729122519493103, + "num_tokens": 355913597.0, + "step": 13757 + }, + { + "epoch": 1.5108719525587526, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3302342891693115, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.705440878868103, + "num_tokens": 355939107.0, + "step": 13758 + }, + { + "epoch": 1.5109817702613662, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.086210012435913, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7102004289627075, + "num_tokens": 355969652.0, + "step": 13759 + }, + { + "epoch": 1.5110915879639797, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.433793783187866, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7308005094528198, + "num_tokens": 355995923.0, + "step": 13760 + }, + { + "epoch": 1.5112014056665934, + "ewc_loss": 1.8835067749023438e-05, 
+ "grad_norm": 2.3497424125671387, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7083275318145752, + "num_tokens": 356020220.0, + "step": 13761 + }, + { + "epoch": 1.5113112233692072, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3197033405303955, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7008773684501648, + "num_tokens": 356047911.0, + "step": 13762 + }, + { + "epoch": 1.5114210410718207, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3510947227478027, + "learning_rate": 1e-06, + "loss": 1.1165, + "mean_token_accuracy": 0.6795028448104858, + "num_tokens": 356074462.0, + "step": 13763 + }, + { + "epoch": 1.5115308587744345, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.229222297668457, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7208683490753174, + "num_tokens": 356101105.0, + "step": 13764 + }, + { + "epoch": 1.511640676477048, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3309950828552246, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6923556923866272, + "num_tokens": 356128051.0, + "step": 13765 + }, + { + "epoch": 1.5117504941796618, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.498795986175537, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7182594537734985, + "num_tokens": 356151505.0, + "step": 13766 + }, + { + "epoch": 1.5118603118822755, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5116653442382812, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7131473422050476, + "num_tokens": 356177332.0, + "step": 13767 + }, + { + "epoch": 1.511970129584889, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.113874673843384, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7402098774909973, + "num_tokens": 356205591.0, + "step": 13768 + }, + { + "epoch": 1.5120799472875026, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 
2.5325021743774414, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7357112765312195, + "num_tokens": 356226094.0, + "step": 13769 + }, + { + "epoch": 1.5121897649901164, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.368980646133423, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6860224008560181, + "num_tokens": 356252556.0, + "step": 13770 + }, + { + "epoch": 1.5122995826927301, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4039812088012695, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7290788888931274, + "num_tokens": 356276592.0, + "step": 13771 + }, + { + "epoch": 1.5124094003953439, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.592813730239868, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7149377465248108, + "num_tokens": 356298457.0, + "step": 13772 + }, + { + "epoch": 1.5125192180979574, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.400191068649292, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7059983015060425, + "num_tokens": 356324337.0, + "step": 13773 + }, + { + "epoch": 1.512629035800571, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6698007583618164, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7173375487327576, + "num_tokens": 356345853.0, + "step": 13774 + }, + { + "epoch": 1.5127388535031847, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2724521160125732, + "learning_rate": 1e-06, + "loss": 1.0963, + "mean_token_accuracy": 0.6761760711669922, + "num_tokens": 356379117.0, + "step": 13775 + }, + { + "epoch": 1.5128486712057985, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4503281116485596, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7065237760543823, + "num_tokens": 356405071.0, + "step": 13776 + }, + { + "epoch": 1.512958488908412, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5490920543670654, + 
"learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7147977352142334, + "num_tokens": 356425984.0, + "step": 13777 + }, + { + "epoch": 1.5130683066110255, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.447307825088501, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7158751487731934, + "num_tokens": 356448982.0, + "step": 13778 + }, + { + "epoch": 1.5131781243136393, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6721761226654053, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7238045334815979, + "num_tokens": 356468840.0, + "step": 13779 + }, + { + "epoch": 1.513287942016253, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.36711049079895, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6871349811553955, + "num_tokens": 356496916.0, + "step": 13780 + }, + { + "epoch": 1.5133977597188668, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5690908432006836, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7333967685699463, + "num_tokens": 356519639.0, + "step": 13781 + }, + { + "epoch": 1.5135075774214803, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.518777370452881, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7058118581771851, + "num_tokens": 356543945.0, + "step": 13782 + }, + { + "epoch": 1.5136173951240939, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.331718921661377, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.7021433711051941, + "num_tokens": 356571028.0, + "step": 13783 + }, + { + "epoch": 1.5137272128267076, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.232667922973633, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.714266300201416, + "num_tokens": 356600256.0, + "step": 13784 + }, + { + "epoch": 1.5138370305293214, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4751884937286377, + "learning_rate": 1e-06, + "loss": 
1.0653, + "mean_token_accuracy": 0.6950067281723022, + "num_tokens": 356625344.0, + "step": 13785 + }, + { + "epoch": 1.5139468482319351, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.300928831100464, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7184965014457703, + "num_tokens": 356650057.0, + "step": 13786 + }, + { + "epoch": 1.5140566659345487, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.453568935394287, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.710908055305481, + "num_tokens": 356675194.0, + "step": 13787 + }, + { + "epoch": 1.5141664836371622, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4995076656341553, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7244759798049927, + "num_tokens": 356698600.0, + "step": 13788 + }, + { + "epoch": 1.514276301339776, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.195829391479492, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6974549293518066, + "num_tokens": 356729849.0, + "step": 13789 + }, + { + "epoch": 1.5143861190423897, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.454341173171997, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7166580557823181, + "num_tokens": 356755212.0, + "step": 13790 + }, + { + "epoch": 1.5144959367450033, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.127709150314331, + "learning_rate": 1e-06, + "loss": 1.0887, + "mean_token_accuracy": 0.6939283609390259, + "num_tokens": 356786954.0, + "step": 13791 + }, + { + "epoch": 1.5146057544476168, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4750072956085205, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.696435809135437, + "num_tokens": 356811653.0, + "step": 13792 + }, + { + "epoch": 1.5147155721502306, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.14620304107666, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 
0.7066728472709656, + "num_tokens": 356841369.0, + "step": 13793 + }, + { + "epoch": 1.5148253898528443, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.335200548171997, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6968973875045776, + "num_tokens": 356870485.0, + "step": 13794 + }, + { + "epoch": 1.514935207555458, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0679502487182617, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.69733065366745, + "num_tokens": 356904064.0, + "step": 13795 + }, + { + "epoch": 1.5150450252580716, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6185221672058105, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7280049324035645, + "num_tokens": 356924167.0, + "step": 13796 + }, + { + "epoch": 1.5151548429606851, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7541637420654297, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.733672022819519, + "num_tokens": 356944442.0, + "step": 13797 + }, + { + "epoch": 1.515264660663299, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8112833499908447, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.7089424133300781, + "num_tokens": 356963483.0, + "step": 13798 + }, + { + "epoch": 1.5153744783659127, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.520362377166748, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6951080560684204, + "num_tokens": 356986322.0, + "step": 13799 + }, + { + "epoch": 1.5154842960685262, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.334994316101074, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6824020743370056, + "num_tokens": 357019143.0, + "step": 13800 + }, + { + "epoch": 1.51559411377114, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.600771903991699, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7159785032272339, + "num_tokens": 
357041884.0, + "step": 13801 + }, + { + "epoch": 1.5157039314737535, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1715714931488037, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6970318555831909, + "num_tokens": 357072800.0, + "step": 13802 + }, + { + "epoch": 1.5158137491763672, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.328429698944092, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7130987048149109, + "num_tokens": 357096559.0, + "step": 13803 + }, + { + "epoch": 1.515923566878981, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.645947217941284, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6899961233139038, + "num_tokens": 357118220.0, + "step": 13804 + }, + { + "epoch": 1.5160333845815945, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.280378580093384, + "learning_rate": 1e-06, + "loss": 1.1222, + "mean_token_accuracy": 0.6751235127449036, + "num_tokens": 357150262.0, + "step": 13805 + }, + { + "epoch": 1.516143202284208, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3767216205596924, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7219376564025879, + "num_tokens": 357176008.0, + "step": 13806 + }, + { + "epoch": 1.5162530199868218, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.427236318588257, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.698407769203186, + "num_tokens": 357202813.0, + "step": 13807 + }, + { + "epoch": 1.5163628376894356, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.335995674133301, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7131520509719849, + "num_tokens": 357229265.0, + "step": 13808 + }, + { + "epoch": 1.5164726553920493, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.283263683319092, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7260029315948486, + "num_tokens": 357256720.0, + "step": 13809 + }, + { + 
"epoch": 1.5165824730946629, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.355081558227539, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7099840641021729, + "num_tokens": 357281438.0, + "step": 13810 + }, + { + "epoch": 1.5166922907972764, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1933772563934326, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7133456468582153, + "num_tokens": 357309993.0, + "step": 13811 + }, + { + "epoch": 1.5168021084998902, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2800722122192383, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6956915855407715, + "num_tokens": 357338261.0, + "step": 13812 + }, + { + "epoch": 1.516911926202504, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3683505058288574, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7115120887756348, + "num_tokens": 357364164.0, + "step": 13813 + }, + { + "epoch": 1.5170217439051175, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.294355630874634, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7234364748001099, + "num_tokens": 357390889.0, + "step": 13814 + }, + { + "epoch": 1.5171315616077312, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3301446437835693, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.704736590385437, + "num_tokens": 357417326.0, + "step": 13815 + }, + { + "epoch": 1.5172413793103448, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.710740089416504, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7243390083312988, + "num_tokens": 357437130.0, + "step": 13816 + }, + { + "epoch": 1.5173511970129585, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3636996746063232, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7133259773254395, + "num_tokens": 357462575.0, + "step": 13817 + }, + { + "epoch": 1.5174610147155723, + 
"ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.385885000228882, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.724958062171936, + "num_tokens": 357485898.0, + "step": 13818 + }, + { + "epoch": 1.5175708324181858, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.647322177886963, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7119409441947937, + "num_tokens": 357508517.0, + "step": 13819 + }, + { + "epoch": 1.5176806501207993, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3561971187591553, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7203747630119324, + "num_tokens": 357535443.0, + "step": 13820 + }, + { + "epoch": 1.517790467823413, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2499091625213623, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7070105075836182, + "num_tokens": 357565210.0, + "step": 13821 + }, + { + "epoch": 1.5179002855260268, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3914520740509033, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.7024514675140381, + "num_tokens": 357589676.0, + "step": 13822 + }, + { + "epoch": 1.5180101032286406, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.177014112472534, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7294206619262695, + "num_tokens": 357616481.0, + "step": 13823 + }, + { + "epoch": 1.5181199209312541, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.477585554122925, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7506076693534851, + "num_tokens": 357637844.0, + "step": 13824 + }, + { + "epoch": 1.5182297386338677, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.175995349884033, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7086001634597778, + "num_tokens": 357665617.0, + "step": 13825 + }, + { + "epoch": 1.5183395563364814, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.254551887512207, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7086017727851868, + "num_tokens": 357693346.0, + "step": 13826 + }, + { + "epoch": 1.5184493740390952, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.459977388381958, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7082198858261108, + "num_tokens": 357716822.0, + "step": 13827 + }, + { + "epoch": 1.5185591917417087, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.333493709564209, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7140582799911499, + "num_tokens": 357743016.0, + "step": 13828 + }, + { + "epoch": 1.5186690094443223, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4843430519104004, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7027292847633362, + "num_tokens": 357767385.0, + "step": 13829 + }, + { + "epoch": 1.518778827146936, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3926305770874023, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.699816107749939, + "num_tokens": 357794899.0, + "step": 13830 + }, + { + "epoch": 1.5188886448495498, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.467512607574463, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7273266315460205, + "num_tokens": 357817416.0, + "step": 13831 + }, + { + "epoch": 1.5189984625521635, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4034783840179443, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.697873592376709, + "num_tokens": 357843749.0, + "step": 13832 + }, + { + "epoch": 1.519108280254777, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.534266710281372, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.720879316329956, + "num_tokens": 357867032.0, + "step": 13833 + }, + { + "epoch": 1.5192180979573906, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5990030765533447, + 
"learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7383071184158325, + "num_tokens": 357888379.0, + "step": 13834 + }, + { + "epoch": 1.5193279156600044, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4934825897216797, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.7040780782699585, + "num_tokens": 357911911.0, + "step": 13835 + }, + { + "epoch": 1.5194377333626181, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.846689224243164, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7469615340232849, + "num_tokens": 357929136.0, + "step": 13836 + }, + { + "epoch": 1.5195475510652319, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.367422580718994, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7121224999427795, + "num_tokens": 357955890.0, + "step": 13837 + }, + { + "epoch": 1.5196573687678454, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.545652389526367, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7197432518005371, + "num_tokens": 357978799.0, + "step": 13838 + }, + { + "epoch": 1.519767186470459, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.8914361000061035, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7119779586791992, + "num_tokens": 357997097.0, + "step": 13839 + }, + { + "epoch": 1.5198770041730727, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.264657497406006, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7069365978240967, + "num_tokens": 358023895.0, + "step": 13840 + }, + { + "epoch": 1.5199868218756865, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.267575740814209, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6978927850723267, + "num_tokens": 358055221.0, + "step": 13841 + }, + { + "epoch": 1.5200966395783, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0764994621276855, + "learning_rate": 1e-06, + "loss": 
0.9393, + "mean_token_accuracy": 0.7189391851425171, + "num_tokens": 358086249.0, + "step": 13842 + }, + { + "epoch": 1.5202064572809135, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4719226360321045, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7250152826309204, + "num_tokens": 358108163.0, + "step": 13843 + }, + { + "epoch": 1.5203162749835273, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.306828737258911, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7014337182044983, + "num_tokens": 358136746.0, + "step": 13844 + }, + { + "epoch": 1.520426092686141, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.468876838684082, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7066013813018799, + "num_tokens": 358161247.0, + "step": 13845 + }, + { + "epoch": 1.5205359103887548, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5761501789093018, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6993982195854187, + "num_tokens": 358185895.0, + "step": 13846 + }, + { + "epoch": 1.5206457280913683, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.531893730163574, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7288521528244019, + "num_tokens": 358207700.0, + "step": 13847 + }, + { + "epoch": 1.5207555457939819, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5617575645446777, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7195040583610535, + "num_tokens": 358229003.0, + "step": 13848 + }, + { + "epoch": 1.5208653634965956, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2978451251983643, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7167384028434753, + "num_tokens": 358254281.0, + "step": 13849 + }, + { + "epoch": 1.5209751811992094, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.41754412651062, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 
0.6912512183189392, + "num_tokens": 358280670.0, + "step": 13850 + }, + { + "epoch": 1.5210849989018231, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1899259090423584, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.691703736782074, + "num_tokens": 358310030.0, + "step": 13851 + }, + { + "epoch": 1.5211948166044367, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.456284523010254, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6976888179779053, + "num_tokens": 358335096.0, + "step": 13852 + }, + { + "epoch": 1.5213046343070502, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.485549211502075, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.719153642654419, + "num_tokens": 358356406.0, + "step": 13853 + }, + { + "epoch": 1.521414452009664, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5994112491607666, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7258825302124023, + "num_tokens": 358377855.0, + "step": 13854 + }, + { + "epoch": 1.5215242697122777, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.34271240234375, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7333695888519287, + "num_tokens": 358405613.0, + "step": 13855 + }, + { + "epoch": 1.5216340874148913, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.246683120727539, + "learning_rate": 1e-06, + "loss": 1.1179, + "mean_token_accuracy": 0.678259015083313, + "num_tokens": 358434728.0, + "step": 13856 + }, + { + "epoch": 1.5217439051175048, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5081515312194824, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7204899787902832, + "num_tokens": 358458558.0, + "step": 13857 + }, + { + "epoch": 1.5218537228201185, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7847342491149902, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7499474883079529, + "num_tokens": 
358475837.0, + "step": 13858 + }, + { + "epoch": 1.5219635405227323, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4733285903930664, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7229262590408325, + "num_tokens": 358500531.0, + "step": 13859 + }, + { + "epoch": 1.522073358225346, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.894531488418579, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7258340120315552, + "num_tokens": 358519057.0, + "step": 13860 + }, + { + "epoch": 1.5221831759279596, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4765071868896484, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7117487788200378, + "num_tokens": 358543699.0, + "step": 13861 + }, + { + "epoch": 1.5222929936305731, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.708493709564209, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7152570486068726, + "num_tokens": 358564328.0, + "step": 13862 + }, + { + "epoch": 1.522402811333187, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5257952213287354, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7288064956665039, + "num_tokens": 358587141.0, + "step": 13863 + }, + { + "epoch": 1.5225126290358006, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2941811084747314, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7184944152832031, + "num_tokens": 358616127.0, + "step": 13864 + }, + { + "epoch": 1.5226224467384142, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5695533752441406, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7059687972068787, + "num_tokens": 358639200.0, + "step": 13865 + }, + { + "epoch": 1.522732264441028, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5683627128601074, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7025513648986816, + "num_tokens": 358666300.0, + "step": 13866 + }, 
+ { + "epoch": 1.5228420821436415, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1831068992614746, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6960036158561707, + "num_tokens": 358696624.0, + "step": 13867 + }, + { + "epoch": 1.5229518998462552, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.217435836791992, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.7027000784873962, + "num_tokens": 358725844.0, + "step": 13868 + }, + { + "epoch": 1.523061717548869, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6561594009399414, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7062929272651672, + "num_tokens": 358746996.0, + "step": 13869 + }, + { + "epoch": 1.5231715352514825, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.024211883544922, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7177507877349854, + "num_tokens": 358772315.0, + "step": 13870 + }, + { + "epoch": 1.523281352954096, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.40114426612854, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7120475172996521, + "num_tokens": 358798304.0, + "step": 13871 + }, + { + "epoch": 1.5233911706567098, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3369946479797363, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7141091823577881, + "num_tokens": 358823430.0, + "step": 13872 + }, + { + "epoch": 1.5235009883593236, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.713721752166748, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.697152853012085, + "num_tokens": 358845496.0, + "step": 13873 + }, + { + "epoch": 1.5236108060619373, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2469735145568848, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7177948355674744, + "num_tokens": 358874770.0, + "step": 13874 + }, + { + "epoch": 1.5237206237645509, + 
"ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3885622024536133, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7177048325538635, + "num_tokens": 358898948.0, + "step": 13875 + }, + { + "epoch": 1.5238304414671644, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2361884117126465, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7317882776260376, + "num_tokens": 358925509.0, + "step": 13876 + }, + { + "epoch": 1.5239402591697782, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.202530860900879, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7084563970565796, + "num_tokens": 358956547.0, + "step": 13877 + }, + { + "epoch": 1.524050076872392, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1909263134002686, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7154205441474915, + "num_tokens": 358985992.0, + "step": 13878 + }, + { + "epoch": 1.5241598945750054, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4024922847747803, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7082672715187073, + "num_tokens": 359011171.0, + "step": 13879 + }, + { + "epoch": 1.5242697122776192, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.887000799179077, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7409481406211853, + "num_tokens": 359029728.0, + "step": 13880 + }, + { + "epoch": 1.5243795299802327, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6365902423858643, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7013904452323914, + "num_tokens": 359051188.0, + "step": 13881 + }, + { + "epoch": 1.5244893476828465, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.378502130508423, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7132726907730103, + "num_tokens": 359075009.0, + "step": 13882 + }, + { + "epoch": 1.5245991653854603, + "ewc_loss": 
1.895427703857422e-05, + "grad_norm": 2.6632227897644043, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7350949048995972, + "num_tokens": 359096328.0, + "step": 13883 + }, + { + "epoch": 1.5247089830880738, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.527704954147339, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7245543003082275, + "num_tokens": 359120262.0, + "step": 13884 + }, + { + "epoch": 1.5248188007906873, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.726155996322632, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7156904339790344, + "num_tokens": 359140696.0, + "step": 13885 + }, + { + "epoch": 1.524928618493301, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5302023887634277, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.7026932239532471, + "num_tokens": 359164468.0, + "step": 13886 + }, + { + "epoch": 1.5250384361959148, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2735378742218018, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6947457790374756, + "num_tokens": 359195179.0, + "step": 13887 + }, + { + "epoch": 1.5251482538985286, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.437997341156006, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6857384443283081, + "num_tokens": 359222376.0, + "step": 13888 + }, + { + "epoch": 1.5252580716011421, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.336082696914673, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.6874985694885254, + "num_tokens": 359251589.0, + "step": 13889 + }, + { + "epoch": 1.5253678893037557, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.129359483718872, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6944217681884766, + "num_tokens": 359283194.0, + "step": 13890 + }, + { + "epoch": 1.5254777070063694, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 
2.3575706481933594, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7253353595733643, + "num_tokens": 359306658.0, + "step": 13891 + }, + { + "epoch": 1.5255875247089832, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.178574800491333, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6880176067352295, + "num_tokens": 359338558.0, + "step": 13892 + }, + { + "epoch": 1.5256973424115967, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.306598424911499, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7143468856811523, + "num_tokens": 359365423.0, + "step": 13893 + }, + { + "epoch": 1.5258071601142102, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7566516399383545, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6939882636070251, + "num_tokens": 359387069.0, + "step": 13894 + }, + { + "epoch": 1.525916977816824, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3958091735839844, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7049117088317871, + "num_tokens": 359412170.0, + "step": 13895 + }, + { + "epoch": 1.5260267955194378, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.225940227508545, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6867809295654297, + "num_tokens": 359441555.0, + "step": 13896 + }, + { + "epoch": 1.5261366132220515, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.597248077392578, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7370504140853882, + "num_tokens": 359463418.0, + "step": 13897 + }, + { + "epoch": 1.526246430924665, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.160306692123413, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7169582843780518, + "num_tokens": 359493384.0, + "step": 13898 + }, + { + "epoch": 1.5263562486272786, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.257523775100708, + "learning_rate": 
1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7164019346237183, + "num_tokens": 359525207.0, + "step": 13899 + }, + { + "epoch": 1.5264660663298923, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2403578758239746, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.711347222328186, + "num_tokens": 359554399.0, + "step": 13900 + }, + { + "epoch": 1.526575884032506, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.626110792160034, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7182513475418091, + "num_tokens": 359574869.0, + "step": 13901 + }, + { + "epoch": 1.5266857017351199, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.52097749710083, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7100256681442261, + "num_tokens": 359598923.0, + "step": 13902 + }, + { + "epoch": 1.5267955194377334, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.225813627243042, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6964794397354126, + "num_tokens": 359633756.0, + "step": 13903 + }, + { + "epoch": 1.526905337140347, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.599730968475342, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7120950222015381, + "num_tokens": 359655317.0, + "step": 13904 + }, + { + "epoch": 1.5270151548429607, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4872498512268066, + "learning_rate": 1e-06, + "loss": 1.0712, + "mean_token_accuracy": 0.6819275617599487, + "num_tokens": 359679187.0, + "step": 13905 + }, + { + "epoch": 1.5271249725455744, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5405113697052, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.714598536491394, + "num_tokens": 359701225.0, + "step": 13906 + }, + { + "epoch": 1.527234790248188, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.230158567428589, + "learning_rate": 1e-06, + "loss": 0.9695, + 
"mean_token_accuracy": 0.7157975435256958, + "num_tokens": 359731034.0, + "step": 13907 + }, + { + "epoch": 1.5273446079508015, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.233189344406128, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6952784061431885, + "num_tokens": 359759267.0, + "step": 13908 + }, + { + "epoch": 1.5274544256534153, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.38730525970459, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7150258421897888, + "num_tokens": 359783366.0, + "step": 13909 + }, + { + "epoch": 1.527564243356029, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.228661298751831, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6902260184288025, + "num_tokens": 359813623.0, + "step": 13910 + }, + { + "epoch": 1.5276740610586428, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2413439750671387, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7312346696853638, + "num_tokens": 359839622.0, + "step": 13911 + }, + { + "epoch": 1.5277838787612563, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6761586666107178, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7102435231208801, + "num_tokens": 359861511.0, + "step": 13912 + }, + { + "epoch": 1.5278936964638699, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6459579467773438, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7021079063415527, + "num_tokens": 359882319.0, + "step": 13913 + }, + { + "epoch": 1.5280035141664836, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4045541286468506, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7053753733634949, + "num_tokens": 359907346.0, + "step": 13914 + }, + { + "epoch": 1.5281133318690974, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2042624950408936, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 
0.7175211906433105, + "num_tokens": 359937308.0, + "step": 13915 + }, + { + "epoch": 1.5282231495717111, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1370248794555664, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7125627994537354, + "num_tokens": 359969915.0, + "step": 13916 + }, + { + "epoch": 1.5283329672743247, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.54771089553833, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7132872343063354, + "num_tokens": 359992266.0, + "step": 13917 + }, + { + "epoch": 1.5284427849769382, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5672988891601562, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7216160297393799, + "num_tokens": 360014096.0, + "step": 13918 + }, + { + "epoch": 1.528552602679552, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3140146732330322, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6903367042541504, + "num_tokens": 360044076.0, + "step": 13919 + }, + { + "epoch": 1.5286624203821657, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.243263006210327, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7109569311141968, + "num_tokens": 360073970.0, + "step": 13920 + }, + { + "epoch": 1.5287722380847792, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3282437324523926, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7163318395614624, + "num_tokens": 360099803.0, + "step": 13921 + }, + { + "epoch": 1.5288820557873928, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.39717960357666, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7068585157394409, + "num_tokens": 360124665.0, + "step": 13922 + }, + { + "epoch": 1.5289918734900065, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.19332218170166, + "learning_rate": 1e-06, + "loss": 1.0953, + "mean_token_accuracy": 0.68818199634552, + "num_tokens": 
360154624.0, + "step": 13923 + }, + { + "epoch": 1.5291016911926203, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.270989179611206, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.718011736869812, + "num_tokens": 360181063.0, + "step": 13924 + }, + { + "epoch": 1.529211508895234, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.206895351409912, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.7020132541656494, + "num_tokens": 360209406.0, + "step": 13925 + }, + { + "epoch": 1.5293213265978476, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5287835597991943, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7276203632354736, + "num_tokens": 360231664.0, + "step": 13926 + }, + { + "epoch": 1.5294311443004611, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.221076250076294, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7149165272712708, + "num_tokens": 360259971.0, + "step": 13927 + }, + { + "epoch": 1.5295409620030749, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5133347511291504, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.723724365234375, + "num_tokens": 360283538.0, + "step": 13928 + }, + { + "epoch": 1.5296507797056886, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.272962808609009, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6930738091468811, + "num_tokens": 360311061.0, + "step": 13929 + }, + { + "epoch": 1.5297605974083022, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.024796962738037, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7307981252670288, + "num_tokens": 360327539.0, + "step": 13930 + }, + { + "epoch": 1.529870415110916, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.407703399658203, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.706876277923584, + "num_tokens": 360352858.0, + "step": 13931 + }, + { 
+ "epoch": 1.5299802328135295, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2562434673309326, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.7054436206817627, + "num_tokens": 360383213.0, + "step": 13932 + }, + { + "epoch": 1.5300900505161432, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.298671007156372, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.7009958028793335, + "num_tokens": 360411710.0, + "step": 13933 + }, + { + "epoch": 1.530199868218757, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.589277982711792, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7006583213806152, + "num_tokens": 360434777.0, + "step": 13934 + }, + { + "epoch": 1.5303096859213705, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2053396701812744, + "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6827529668807983, + "num_tokens": 360467550.0, + "step": 13935 + }, + { + "epoch": 1.530419503623984, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3791818618774414, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6945540904998779, + "num_tokens": 360494011.0, + "step": 13936 + }, + { + "epoch": 1.5305293213265978, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.25350022315979, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7108587026596069, + "num_tokens": 360521569.0, + "step": 13937 + }, + { + "epoch": 1.5306391390292116, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4227945804595947, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7121909260749817, + "num_tokens": 360548911.0, + "step": 13938 + }, + { + "epoch": 1.5307489567318253, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 7.026113033294678, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.7005647420883179, + "num_tokens": 360574672.0, + "step": 13939 + }, + { + "epoch": 1.5308587744344389, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4452266693115234, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.714148759841919, + "num_tokens": 360599959.0, + "step": 13940 + }, + { + "epoch": 1.5309685921370524, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.347353219985962, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.709047794342041, + "num_tokens": 360626176.0, + "step": 13941 + }, + { + "epoch": 1.5310784098396661, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3385045528411865, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.718306839466095, + "num_tokens": 360650766.0, + "step": 13942 + }, + { + "epoch": 1.53118822754228, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.445244550704956, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7013210654258728, + "num_tokens": 360674516.0, + "step": 13943 + }, + { + "epoch": 1.5312980452448934, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.13488507270813, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.701691746711731, + "num_tokens": 360702725.0, + "step": 13944 + }, + { + "epoch": 1.5314078629475072, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.636594533920288, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6973759531974792, + "num_tokens": 360727098.0, + "step": 13945 + }, + { + "epoch": 1.5315176806501207, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.332317590713501, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7109664082527161, + "num_tokens": 360755180.0, + "step": 13946 + }, + { + "epoch": 1.5316274983527345, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4793808460235596, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7250343561172485, + "num_tokens": 360779517.0, + "step": 13947 + }, + { + "epoch": 1.5317373160553482, + "ewc_loss": 1.8835067749023438e-05, + 
"grad_norm": 2.6147212982177734, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7075539231300354, + "num_tokens": 360799708.0, + "step": 13948 + }, + { + "epoch": 1.5318471337579618, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.476318597793579, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6940069794654846, + "num_tokens": 360825254.0, + "step": 13949 + }, + { + "epoch": 1.5319569514605753, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6982808113098145, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7081254124641418, + "num_tokens": 360844500.0, + "step": 13950 + }, + { + "epoch": 1.532066769163189, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.7985501289367676, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.702485203742981, + "num_tokens": 360864445.0, + "step": 13951 + }, + { + "epoch": 1.5321765868658028, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4980127811431885, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7346182465553284, + "num_tokens": 360887670.0, + "step": 13952 + }, + { + "epoch": 1.5322864045684166, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4575555324554443, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7300904989242554, + "num_tokens": 360911609.0, + "step": 13953 + }, + { + "epoch": 1.5323962222710301, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6056675910949707, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7102602124214172, + "num_tokens": 360935516.0, + "step": 13954 + }, + { + "epoch": 1.5325060399736437, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8413784503936768, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.7039141654968262, + "num_tokens": 360957172.0, + "step": 13955 + }, + { + "epoch": 1.5326158576762574, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 
7.062586307525635, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7326018810272217, + "num_tokens": 360984712.0, + "step": 13956 + }, + { + "epoch": 1.5327256753788712, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.368239164352417, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.688486635684967, + "num_tokens": 361013040.0, + "step": 13957 + }, + { + "epoch": 1.5328354930814847, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2302699089050293, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6912925243377686, + "num_tokens": 361045049.0, + "step": 13958 + }, + { + "epoch": 1.5329453107840982, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.343430757522583, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6847899556159973, + "num_tokens": 361074053.0, + "step": 13959 + }, + { + "epoch": 1.533055128486712, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3038489818573, + "learning_rate": 1e-06, + "loss": 1.097, + "mean_token_accuracy": 0.6778567433357239, + "num_tokens": 361104096.0, + "step": 13960 + }, + { + "epoch": 1.5331649461893258, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4642653465270996, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7132617235183716, + "num_tokens": 361127413.0, + "step": 13961 + }, + { + "epoch": 1.5332747638919395, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.17051100730896, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.7028793096542358, + "num_tokens": 361156264.0, + "step": 13962 + }, + { + "epoch": 1.533384581594553, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.603811740875244, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7069567441940308, + "num_tokens": 361179506.0, + "step": 13963 + }, + { + "epoch": 1.5334943992971666, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.165642738342285, + "learning_rate": 
1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.701725959777832, + "num_tokens": 361208503.0, + "step": 13964 + }, + { + "epoch": 1.5336042169997803, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.26816725730896, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7006270885467529, + "num_tokens": 361236807.0, + "step": 13965 + }, + { + "epoch": 1.533714034702394, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.372678279876709, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.734411358833313, + "num_tokens": 361260891.0, + "step": 13966 + }, + { + "epoch": 1.5338238524050078, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6542577743530273, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7433135509490967, + "num_tokens": 361280470.0, + "step": 13967 + }, + { + "epoch": 1.5339336701076214, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.204767942428589, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7061694264411926, + "num_tokens": 361308667.0, + "step": 13968 + }, + { + "epoch": 1.534043487810235, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.383183240890503, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.719536542892456, + "num_tokens": 361333878.0, + "step": 13969 + }, + { + "epoch": 1.5341533055128487, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6422536373138428, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6941311955451965, + "num_tokens": 361360833.0, + "step": 13970 + }, + { + "epoch": 1.5342631232154624, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.309877634048462, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7031480669975281, + "num_tokens": 361388075.0, + "step": 13971 + }, + { + "epoch": 1.534372940918076, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4978585243225098, + "learning_rate": 1e-06, + "loss": 0.9404, + 
"mean_token_accuracy": 0.7169294357299805, + "num_tokens": 361413148.0, + "step": 13972 + }, + { + "epoch": 1.5344827586206895, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.402022123336792, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7022882103919983, + "num_tokens": 361441051.0, + "step": 13973 + }, + { + "epoch": 1.5345925763233033, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5606296062469482, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7179100513458252, + "num_tokens": 361464033.0, + "step": 13974 + }, + { + "epoch": 1.534702394025917, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.336005210876465, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7168469429016113, + "num_tokens": 361490130.0, + "step": 13975 + }, + { + "epoch": 1.5348122117285308, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.239441156387329, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7261385917663574, + "num_tokens": 361516786.0, + "step": 13976 + }, + { + "epoch": 1.5349220294311443, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4449284076690674, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7240957021713257, + "num_tokens": 361541102.0, + "step": 13977 + }, + { + "epoch": 1.5350318471337578, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5780742168426514, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6956919431686401, + "num_tokens": 361564405.0, + "step": 13978 + }, + { + "epoch": 1.5351416648363716, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.539684772491455, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7271149158477783, + "num_tokens": 361587221.0, + "step": 13979 + }, + { + "epoch": 1.5352514825389854, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3147928714752197, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 
0.6805075407028198, + "num_tokens": 361617470.0, + "step": 13980 + }, + { + "epoch": 1.535361300241599, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.552548885345459, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6992690563201904, + "num_tokens": 361641579.0, + "step": 13981 + }, + { + "epoch": 1.5354711179442126, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.84716534614563, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7106376886367798, + "num_tokens": 361660463.0, + "step": 13982 + }, + { + "epoch": 1.5355809356468262, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3077478408813477, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7074187994003296, + "num_tokens": 361687467.0, + "step": 13983 + }, + { + "epoch": 1.53569075334944, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4653191566467285, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7227399349212646, + "num_tokens": 361711970.0, + "step": 13984 + }, + { + "epoch": 1.5358005710520537, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.602712869644165, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7143194079399109, + "num_tokens": 361734584.0, + "step": 13985 + }, + { + "epoch": 1.5359103887546672, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2503390312194824, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.706977128982544, + "num_tokens": 361764865.0, + "step": 13986 + }, + { + "epoch": 1.5360202064572808, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2846803665161133, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.70304274559021, + "num_tokens": 361794556.0, + "step": 13987 + }, + { + "epoch": 1.5361300241598945, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.383521795272827, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.715552806854248, + "num_tokens": 
361820100.0, + "step": 13988 + }, + { + "epoch": 1.5362398418625083, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3468666076660156, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7019806504249573, + "num_tokens": 361847555.0, + "step": 13989 + }, + { + "epoch": 1.536349659565122, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3584957122802734, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7156721353530884, + "num_tokens": 361873202.0, + "step": 13990 + }, + { + "epoch": 1.5364594772677356, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.207449436187744, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7184301614761353, + "num_tokens": 361900641.0, + "step": 13991 + }, + { + "epoch": 1.536569294970349, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.7426693439483643, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7127010822296143, + "num_tokens": 361920081.0, + "step": 13992 + }, + { + "epoch": 1.5366791126729629, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1690902709960938, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.7035355567932129, + "num_tokens": 361951476.0, + "step": 13993 + }, + { + "epoch": 1.5367889303755766, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.447343349456787, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7073352336883545, + "num_tokens": 361978693.0, + "step": 13994 + }, + { + "epoch": 1.5368987480781902, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.660494804382324, + "learning_rate": 1e-06, + "loss": 1.1422, + "mean_token_accuracy": 0.6729174256324768, + "num_tokens": 362001740.0, + "step": 13995 + }, + { + "epoch": 1.537008565780804, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5820438861846924, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7110044956207275, + "num_tokens": 362024253.0, + "step": 13996 + 
}, + { + "epoch": 1.5371183834834174, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.288926362991333, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.7046425342559814, + "num_tokens": 362052889.0, + "step": 13997 + }, + { + "epoch": 1.5372282011860312, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.242485523223877, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7337943315505981, + "num_tokens": 362081419.0, + "step": 13998 + }, + { + "epoch": 1.537338018888645, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.776623249053955, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7256268262863159, + "num_tokens": 362101127.0, + "step": 13999 + }, + { + "epoch": 1.5374478365912585, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3176608085632324, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7395683526992798, + "num_tokens": 362129145.0, + "step": 14000 + }, + { + "epoch": 1.537557654293872, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7984488010406494, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.724868655204773, + "num_tokens": 362148434.0, + "step": 14001 + }, + { + "epoch": 1.5376674719964858, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2409701347351074, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6974350214004517, + "num_tokens": 362177949.0, + "step": 14002 + }, + { + "epoch": 1.5377772896990995, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4666953086853027, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.693342924118042, + "num_tokens": 362202940.0, + "step": 14003 + }, + { + "epoch": 1.5378871074017133, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5356528759002686, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7091596126556396, + "num_tokens": 362227009.0, + "step": 14004 + }, + { + "epoch": 1.5379969251043268, 
+ "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.418731212615967, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.723060667514801, + "num_tokens": 362251623.0, + "step": 14005 + }, + { + "epoch": 1.5381067428069404, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3017501831054688, + "learning_rate": 1e-06, + "loss": 1.0864, + "mean_token_accuracy": 0.6796166896820068, + "num_tokens": 362282130.0, + "step": 14006 + }, + { + "epoch": 1.5382165605095541, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3420207500457764, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6890110373497009, + "num_tokens": 362309947.0, + "step": 14007 + }, + { + "epoch": 1.5383263782121679, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.685265302658081, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7126660346984863, + "num_tokens": 362331758.0, + "step": 14008 + }, + { + "epoch": 1.5384361959147814, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4732179641723633, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7036934494972229, + "num_tokens": 362355589.0, + "step": 14009 + }, + { + "epoch": 1.538546013617395, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.496192216873169, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7243674993515015, + "num_tokens": 362378441.0, + "step": 14010 + }, + { + "epoch": 1.5386558313200087, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2181243896484375, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.710588812828064, + "num_tokens": 362406264.0, + "step": 14011 + }, + { + "epoch": 1.5387656490226225, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.523538112640381, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7183465361595154, + "num_tokens": 362429698.0, + "step": 14012 + }, + { + "epoch": 1.5388754667252362, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.5376415252685547, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.726627767086029, + "num_tokens": 362451815.0, + "step": 14013 + }, + { + "epoch": 1.5389852844278498, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.518394708633423, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7007257342338562, + "num_tokens": 362475655.0, + "step": 14014 + }, + { + "epoch": 1.5390951021304633, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.129467010498047, + "learning_rate": 1e-06, + "loss": 1.085, + "mean_token_accuracy": 0.6884755492210388, + "num_tokens": 362509816.0, + "step": 14015 + }, + { + "epoch": 1.539204919833077, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3229281902313232, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7189978361129761, + "num_tokens": 362536931.0, + "step": 14016 + }, + { + "epoch": 1.5393147375356908, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6303799152374268, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7184816002845764, + "num_tokens": 362558784.0, + "step": 14017 + }, + { + "epoch": 1.5394245552383046, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.568779230117798, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.742749035358429, + "num_tokens": 362579862.0, + "step": 14018 + }, + { + "epoch": 1.539534372940918, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3635191917419434, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7339197993278503, + "num_tokens": 362604486.0, + "step": 14019 + }, + { + "epoch": 1.5396441906435316, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2068657875061035, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7202016711235046, + "num_tokens": 362632912.0, + "step": 14020 + }, + { + "epoch": 1.5397540083461454, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.312405586242676, + 
"learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7238850593566895, + "num_tokens": 362658011.0, + "step": 14021 + }, + { + "epoch": 1.5398638260487592, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7545320987701416, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7009496688842773, + "num_tokens": 362678574.0, + "step": 14022 + }, + { + "epoch": 1.5399736437513727, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.326373338699341, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.696779727935791, + "num_tokens": 362707936.0, + "step": 14023 + }, + { + "epoch": 1.5400834614539862, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.367687702178955, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6931092739105225, + "num_tokens": 362736487.0, + "step": 14024 + }, + { + "epoch": 1.5401932791566, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.529243230819702, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.7036141157150269, + "num_tokens": 362759878.0, + "step": 14025 + }, + { + "epoch": 1.5403030968592137, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.246281862258911, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6931166648864746, + "num_tokens": 362788417.0, + "step": 14026 + }, + { + "epoch": 1.5404129145618275, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1154823303222656, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7139793634414673, + "num_tokens": 362817928.0, + "step": 14027 + }, + { + "epoch": 1.540522732264441, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.316493034362793, + "learning_rate": 1e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6791951656341553, + "num_tokens": 362846569.0, + "step": 14028 + }, + { + "epoch": 1.5406325499670546, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3816235065460205, + "learning_rate": 1e-06, + "loss": 0.9181, 
+ "mean_token_accuracy": 0.7272307276725769, + "num_tokens": 362872228.0, + "step": 14029 + }, + { + "epoch": 1.5407423676696683, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.488633155822754, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.724141001701355, + "num_tokens": 362894173.0, + "step": 14030 + }, + { + "epoch": 1.540852185372282, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.432828664779663, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7098267078399658, + "num_tokens": 362920259.0, + "step": 14031 + }, + { + "epoch": 1.5409620030748958, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7385945320129395, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7361575365066528, + "num_tokens": 362938237.0, + "step": 14032 + }, + { + "epoch": 1.5410718207775094, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.494507312774658, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7113829255104065, + "num_tokens": 362961984.0, + "step": 14033 + }, + { + "epoch": 1.541181638480123, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.626386880874634, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6862441301345825, + "num_tokens": 362985064.0, + "step": 14034 + }, + { + "epoch": 1.5412914561827367, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.374715805053711, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7070010304450989, + "num_tokens": 363010611.0, + "step": 14035 + }, + { + "epoch": 1.5414012738853504, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.666558027267456, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.719528317451477, + "num_tokens": 363032997.0, + "step": 14036 + }, + { + "epoch": 1.541511091587964, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.253512382507324, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6942200660705566, 
+ "num_tokens": 363060709.0, + "step": 14037 + }, + { + "epoch": 1.5416209092905775, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.41607403755188, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7060524225234985, + "num_tokens": 363087751.0, + "step": 14038 + }, + { + "epoch": 1.5417307269931912, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 7.132309436798096, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7077155113220215, + "num_tokens": 363109821.0, + "step": 14039 + }, + { + "epoch": 1.541840544695805, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2553091049194336, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6951093673706055, + "num_tokens": 363140215.0, + "step": 14040 + }, + { + "epoch": 1.5419503623984188, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4430482387542725, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.7057750225067139, + "num_tokens": 363164361.0, + "step": 14041 + }, + { + "epoch": 1.5420601801010323, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.0958359241485596, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6973003149032593, + "num_tokens": 363199268.0, + "step": 14042 + }, + { + "epoch": 1.5421699978036458, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2501220703125, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6918280720710754, + "num_tokens": 363229026.0, + "step": 14043 + }, + { + "epoch": 1.5422798155062596, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2696948051452637, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7189445495605469, + "num_tokens": 363257039.0, + "step": 14044 + }, + { + "epoch": 1.5423896332088733, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3251495361328125, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.696990430355072, + "num_tokens": 363282409.0, + 
"step": 14045 + }, + { + "epoch": 1.5424994509114869, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.338111162185669, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7034175992012024, + "num_tokens": 363311899.0, + "step": 14046 + }, + { + "epoch": 1.5426092686141006, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3180971145629883, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7156660556793213, + "num_tokens": 363337782.0, + "step": 14047 + }, + { + "epoch": 1.5427190863167142, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4283876419067383, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7215022444725037, + "num_tokens": 363361218.0, + "step": 14048 + }, + { + "epoch": 1.542828904019328, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.0808963775634766, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7224664688110352, + "num_tokens": 363390987.0, + "step": 14049 + }, + { + "epoch": 1.5429387217219417, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.114431858062744, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7282131910324097, + "num_tokens": 363423598.0, + "step": 14050 + }, + { + "epoch": 1.5430485394245552, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5310609340667725, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7202259302139282, + "num_tokens": 363445154.0, + "step": 14051 + }, + { + "epoch": 1.5431583571271688, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2045018672943115, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.695766806602478, + "num_tokens": 363475840.0, + "step": 14052 + }, + { + "epoch": 1.5432681748297825, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.512812614440918, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.729902982711792, + "num_tokens": 363497933.0, + "step": 14053 + }, + { + 
"epoch": 1.5433779925323963, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.211339235305786, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7065569758415222, + "num_tokens": 363524024.0, + "step": 14054 + }, + { + "epoch": 1.54348781023501, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.438659191131592, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7085820436477661, + "num_tokens": 363549265.0, + "step": 14055 + }, + { + "epoch": 1.5435976279376236, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.649111032485962, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7164657115936279, + "num_tokens": 363571016.0, + "step": 14056 + }, + { + "epoch": 1.543707445640237, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.219520330429077, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6898524761199951, + "num_tokens": 363600326.0, + "step": 14057 + }, + { + "epoch": 1.5438172633428509, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3290069103240967, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7064521312713623, + "num_tokens": 363626848.0, + "step": 14058 + }, + { + "epoch": 1.5439270810454646, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.157722234725952, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7122430801391602, + "num_tokens": 363656209.0, + "step": 14059 + }, + { + "epoch": 1.5440368987480781, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2851169109344482, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7068518996238708, + "num_tokens": 363683720.0, + "step": 14060 + }, + { + "epoch": 1.544146716450692, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5797910690307617, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7060735821723938, + "num_tokens": 363705783.0, + "step": 14061 + }, + { + "epoch": 1.5442565341533054, + 
"ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4295432567596436, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.7103303074836731, + "num_tokens": 363729198.0, + "step": 14062 + }, + { + "epoch": 1.5443663518559192, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1896860599517822, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7296143770217896, + "num_tokens": 363757236.0, + "step": 14063 + }, + { + "epoch": 1.544476169558533, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.330146074295044, + "learning_rate": 1e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.6813986301422119, + "num_tokens": 363784799.0, + "step": 14064 + }, + { + "epoch": 1.5445859872611465, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4946861267089844, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.717298686504364, + "num_tokens": 363806497.0, + "step": 14065 + }, + { + "epoch": 1.54469580496376, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3502721786499023, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6934972405433655, + "num_tokens": 363833430.0, + "step": 14066 + }, + { + "epoch": 1.5448056226663738, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5520236492156982, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.7009046077728271, + "num_tokens": 363856361.0, + "step": 14067 + }, + { + "epoch": 1.5449154403689875, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.337270975112915, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7178367376327515, + "num_tokens": 363881557.0, + "step": 14068 + }, + { + "epoch": 1.5450252580716013, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.754964828491211, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.707013726234436, + "num_tokens": 363901078.0, + "step": 14069 + }, + { + "epoch": 1.5451350757742148, + "ewc_loss": 
1.8835067749023438e-05, + "grad_norm": 2.5206377506256104, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7326204180717468, + "num_tokens": 363923541.0, + "step": 14070 + }, + { + "epoch": 1.5452448934768284, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4065651893615723, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.709825336933136, + "num_tokens": 363947757.0, + "step": 14071 + }, + { + "epoch": 1.5453547111794421, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5935134887695312, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7020490169525146, + "num_tokens": 363969680.0, + "step": 14072 + }, + { + "epoch": 1.5454645288820559, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3993711471557617, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6896306872367859, + "num_tokens": 363995485.0, + "step": 14073 + }, + { + "epoch": 1.5455743465846694, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6668272018432617, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6985524892807007, + "num_tokens": 364017312.0, + "step": 14074 + }, + { + "epoch": 1.545684164287283, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.25484037399292, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7059223651885986, + "num_tokens": 364043938.0, + "step": 14075 + }, + { + "epoch": 1.5457939819898967, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.9117345809936523, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7268280386924744, + "num_tokens": 364066453.0, + "step": 14076 + }, + { + "epoch": 1.5459037996925105, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.482630491256714, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6928987503051758, + "num_tokens": 364089828.0, + "step": 14077 + }, + { + "epoch": 1.5460136173951242, + "ewc_loss": 1.8835067749023438e-05, + 
"grad_norm": 2.3709709644317627, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.7124221920967102, + "num_tokens": 364117049.0, + "step": 14078 + }, + { + "epoch": 1.5461234350977378, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.424905776977539, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7127717733383179, + "num_tokens": 364141653.0, + "step": 14079 + }, + { + "epoch": 1.5462332528003513, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.435609817504883, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7380672693252563, + "num_tokens": 364167485.0, + "step": 14080 + }, + { + "epoch": 1.546343070502965, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 3.0050911903381348, + "learning_rate": 1e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7412075996398926, + "num_tokens": 364183903.0, + "step": 14081 + }, + { + "epoch": 1.5464528882055788, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.371281862258911, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6995097398757935, + "num_tokens": 364210196.0, + "step": 14082 + }, + { + "epoch": 1.5465627059081926, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.347578763961792, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7237787842750549, + "num_tokens": 364235182.0, + "step": 14083 + }, + { + "epoch": 1.546672523610806, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.430478096008301, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7077719569206238, + "num_tokens": 364260900.0, + "step": 14084 + }, + { + "epoch": 1.5467823413134196, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.464383125305176, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6899288892745972, + "num_tokens": 364288023.0, + "step": 14085 + }, + { + "epoch": 1.5468921590160334, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.6884653568267822, 
+ "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7078620195388794, + "num_tokens": 364318840.0, + "step": 14086 + }, + { + "epoch": 1.5470019767186471, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3075382709503174, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7093580961227417, + "num_tokens": 364346546.0, + "step": 14087 + }, + { + "epoch": 1.5471117944212607, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8197343349456787, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7230538129806519, + "num_tokens": 364367853.0, + "step": 14088 + }, + { + "epoch": 1.5472216121238742, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.170091152191162, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7041540741920471, + "num_tokens": 364395855.0, + "step": 14089 + }, + { + "epoch": 1.547331429826488, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.447134017944336, + "learning_rate": 1e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.6858344674110413, + "num_tokens": 364421719.0, + "step": 14090 + }, + { + "epoch": 1.5474412475291017, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.718832492828369, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7264856100082397, + "num_tokens": 364444366.0, + "step": 14091 + }, + { + "epoch": 1.5475510652317155, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.500011444091797, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.7028820514678955, + "num_tokens": 364467061.0, + "step": 14092 + }, + { + "epoch": 1.547660882934329, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.568131923675537, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7215784788131714, + "num_tokens": 364488964.0, + "step": 14093 + }, + { + "epoch": 1.5477707006369426, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6936097145080566, + "learning_rate": 1e-06, + "loss": 
1.0076, + "mean_token_accuracy": 0.705193281173706, + "num_tokens": 364511490.0, + "step": 14094 + }, + { + "epoch": 1.5478805183395563, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4724957942962646, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7167599201202393, + "num_tokens": 364537702.0, + "step": 14095 + }, + { + "epoch": 1.54799033604217, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2045013904571533, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7149561643600464, + "num_tokens": 364566065.0, + "step": 14096 + }, + { + "epoch": 1.5481001537447838, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.409663677215576, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7155197858810425, + "num_tokens": 364591670.0, + "step": 14097 + }, + { + "epoch": 1.5482099714473974, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.0943920612335205, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7226434946060181, + "num_tokens": 364622290.0, + "step": 14098 + }, + { + "epoch": 1.548319789150011, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.7158560752868652, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7132918834686279, + "num_tokens": 364643694.0, + "step": 14099 + }, + { + "epoch": 1.5484296068526247, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.33491587638855, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.7043643593788147, + "num_tokens": 364671334.0, + "step": 14100 + }, + { + "epoch": 1.5485394245552384, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.486865758895874, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7036835551261902, + "num_tokens": 364694191.0, + "step": 14101 + }, + { + "epoch": 1.548649242257852, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.528074264526367, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 
0.7038729786872864, + "num_tokens": 364718287.0, + "step": 14102 + }, + { + "epoch": 1.5487590599604655, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4261608123779297, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7380220890045166, + "num_tokens": 364742007.0, + "step": 14103 + }, + { + "epoch": 1.5488688776630792, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2553651332855225, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.7061552405357361, + "num_tokens": 364772320.0, + "step": 14104 + }, + { + "epoch": 1.548978695365693, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.51628041267395, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.714240312576294, + "num_tokens": 364794693.0, + "step": 14105 + }, + { + "epoch": 1.5490885130683067, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.381269693374634, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7049939632415771, + "num_tokens": 364821681.0, + "step": 14106 + }, + { + "epoch": 1.5491983307709203, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.143561363220215, + "learning_rate": 1e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6826575398445129, + "num_tokens": 364854392.0, + "step": 14107 + }, + { + "epoch": 1.5493081484735338, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.292673349380493, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7137264609336853, + "num_tokens": 364882341.0, + "step": 14108 + }, + { + "epoch": 1.5494179661761476, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4196479320526123, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.7046573162078857, + "num_tokens": 364905926.0, + "step": 14109 + }, + { + "epoch": 1.5495277838787613, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3368468284606934, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6937031745910645, + 
"num_tokens": 364934990.0, + "step": 14110 + }, + { + "epoch": 1.5496376015813749, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.473773956298828, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7003738880157471, + "num_tokens": 364960789.0, + "step": 14111 + }, + { + "epoch": 1.5497474192839886, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.309889793395996, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7018432021141052, + "num_tokens": 364986802.0, + "step": 14112 + }, + { + "epoch": 1.5498572369866022, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3831138610839844, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.706546425819397, + "num_tokens": 365012297.0, + "step": 14113 + }, + { + "epoch": 1.549967054689216, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.430799961090088, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7250388264656067, + "num_tokens": 365037997.0, + "step": 14114 + }, + { + "epoch": 1.5500768723918297, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.83990478515625, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7406055927276611, + "num_tokens": 365058239.0, + "step": 14115 + }, + { + "epoch": 1.5501866900944432, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.360149383544922, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6982214450836182, + "num_tokens": 365086407.0, + "step": 14116 + }, + { + "epoch": 1.5502965077970567, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.542428493499756, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7298632860183716, + "num_tokens": 365109176.0, + "step": 14117 + }, + { + "epoch": 1.5504063254996705, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.599496841430664, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7134419679641724, + "num_tokens": 365131615.0, + "step": 
14118 + }, + { + "epoch": 1.5505161432022843, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4295408725738525, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6958142518997192, + "num_tokens": 365158257.0, + "step": 14119 + }, + { + "epoch": 1.550625960904898, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.0025417804718018, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7469044327735901, + "num_tokens": 365175178.0, + "step": 14120 + }, + { + "epoch": 1.5507357786075116, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4939515590667725, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7169163823127747, + "num_tokens": 365197933.0, + "step": 14121 + }, + { + "epoch": 1.550845596310125, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.29545259475708, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6973099708557129, + "num_tokens": 365233179.0, + "step": 14122 + }, + { + "epoch": 1.5509554140127388, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.380824565887451, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.700050950050354, + "num_tokens": 365259249.0, + "step": 14123 + }, + { + "epoch": 1.5510652317153526, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.474985361099243, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7136082649230957, + "num_tokens": 365282260.0, + "step": 14124 + }, + { + "epoch": 1.5511750494179661, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.36594295501709, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7067624926567078, + "num_tokens": 365308282.0, + "step": 14125 + }, + { + "epoch": 1.55128486712058, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.338054656982422, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7131696939468384, + "num_tokens": 365335380.0, + "step": 14126 + }, + { + "epoch": 
1.5513946848231934, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.433103084564209, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.7065160274505615, + "num_tokens": 365360887.0, + "step": 14127 + }, + { + "epoch": 1.5515045025258072, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.7194182872772217, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7331488132476807, + "num_tokens": 365380668.0, + "step": 14128 + }, + { + "epoch": 1.551614320228421, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2825467586517334, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7195334434509277, + "num_tokens": 365407323.0, + "step": 14129 + }, + { + "epoch": 1.5517241379310345, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.8131113052368164, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7307502627372742, + "num_tokens": 365426765.0, + "step": 14130 + }, + { + "epoch": 1.551833955633648, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.296384811401367, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7138063907623291, + "num_tokens": 365452730.0, + "step": 14131 + }, + { + "epoch": 1.5519437733362618, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.56885027885437, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7181592583656311, + "num_tokens": 365478368.0, + "step": 14132 + }, + { + "epoch": 1.5520535910388755, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.525254487991333, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.7050171494483948, + "num_tokens": 365502634.0, + "step": 14133 + }, + { + "epoch": 1.5521634087414893, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.685849905014038, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7223384380340576, + "num_tokens": 365523409.0, + "step": 14134 + }, + { + "epoch": 1.5522732264441028, + "ewc_loss": 
1.8835067749023438e-05, + "grad_norm": 2.3081302642822266, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7147440910339355, + "num_tokens": 365550306.0, + "step": 14135 + }, + { + "epoch": 1.5523830441467164, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2959489822387695, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7361952066421509, + "num_tokens": 365575175.0, + "step": 14136 + }, + { + "epoch": 1.55249286184933, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.827932357788086, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7424241304397583, + "num_tokens": 365592589.0, + "step": 14137 + }, + { + "epoch": 1.5526026795519439, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.61517596244812, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7211024165153503, + "num_tokens": 365615450.0, + "step": 14138 + }, + { + "epoch": 1.5527124972545574, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3143439292907715, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7132571339607239, + "num_tokens": 365643137.0, + "step": 14139 + }, + { + "epoch": 1.552822314957171, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.263676643371582, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7158106565475464, + "num_tokens": 365670653.0, + "step": 14140 + }, + { + "epoch": 1.5529321326597847, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2595560550689697, + "learning_rate": 1e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6911612749099731, + "num_tokens": 365699304.0, + "step": 14141 + }, + { + "epoch": 1.5530419503623984, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3059868812561035, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6954652070999146, + "num_tokens": 365727050.0, + "step": 14142 + }, + { + "epoch": 1.5531517680650122, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.388620376586914, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7125159502029419, + "num_tokens": 365752438.0, + "step": 14143 + }, + { + "epoch": 1.5532615857676257, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3860385417938232, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.70777827501297, + "num_tokens": 365775100.0, + "step": 14144 + }, + { + "epoch": 1.5533714034702393, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2390403747558594, + "learning_rate": 1e-06, + "loss": 1.0895, + "mean_token_accuracy": 0.6802131533622742, + "num_tokens": 365804574.0, + "step": 14145 + }, + { + "epoch": 1.553481221172853, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5289394855499268, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.69620680809021, + "num_tokens": 365827418.0, + "step": 14146 + }, + { + "epoch": 1.5535910388754668, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.614284038543701, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7442793846130371, + "num_tokens": 365847419.0, + "step": 14147 + }, + { + "epoch": 1.5537008565780805, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3163866996765137, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7146061062812805, + "num_tokens": 365874118.0, + "step": 14148 + }, + { + "epoch": 1.553810674280694, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 7.013779640197754, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7047497034072876, + "num_tokens": 365902006.0, + "step": 14149 + }, + { + "epoch": 1.5539204919833076, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6782913208007812, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7120256423950195, + "num_tokens": 365922732.0, + "step": 14150 + }, + { + "epoch": 1.5540303096859214, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.400405168533325, + 
"learning_rate": 1e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.6856788992881775, + "num_tokens": 365951237.0, + "step": 14151 + }, + { + "epoch": 1.5541401273885351, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4662230014801025, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7141405940055847, + "num_tokens": 365976298.0, + "step": 14152 + }, + { + "epoch": 1.5542499450911487, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.451997995376587, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6942549347877502, + "num_tokens": 366003615.0, + "step": 14153 + }, + { + "epoch": 1.5543597627937622, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.527343988418579, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7073675394058228, + "num_tokens": 366028463.0, + "step": 14154 + }, + { + "epoch": 1.554469580496376, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3867478370666504, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7067304849624634, + "num_tokens": 366054708.0, + "step": 14155 + }, + { + "epoch": 1.5545793981989897, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.215210437774658, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.711361289024353, + "num_tokens": 366084130.0, + "step": 14156 + }, + { + "epoch": 1.5546892159016035, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7274768352508545, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7326740622520447, + "num_tokens": 366105798.0, + "step": 14157 + }, + { + "epoch": 1.554799033604217, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.0841846466064453, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6965777277946472, + "num_tokens": 366138830.0, + "step": 14158 + }, + { + "epoch": 1.5549088513068305, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4853110313415527, + "learning_rate": 1e-06, + "loss": 
0.9299, + "mean_token_accuracy": 0.7239316701889038, + "num_tokens": 366164338.0, + "step": 14159 + }, + { + "epoch": 1.5550186690094443, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5884549617767334, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7084656357765198, + "num_tokens": 366188310.0, + "step": 14160 + }, + { + "epoch": 1.555128486712058, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.236478090286255, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7226032018661499, + "num_tokens": 366218560.0, + "step": 14161 + }, + { + "epoch": 1.5552383044146716, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6357572078704834, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7181359529495239, + "num_tokens": 366239808.0, + "step": 14162 + }, + { + "epoch": 1.5553481221172853, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3790431022644043, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7209395170211792, + "num_tokens": 366265356.0, + "step": 14163 + }, + { + "epoch": 1.5554579398198989, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.297868490219116, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7090850472450256, + "num_tokens": 366294312.0, + "step": 14164 + }, + { + "epoch": 1.5555677575225126, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6433956623077393, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7033368349075317, + "num_tokens": 366315345.0, + "step": 14165 + }, + { + "epoch": 1.5556775752251264, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.449420928955078, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6995829343795776, + "num_tokens": 366338596.0, + "step": 14166 + }, + { + "epoch": 1.55578739292774, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.645081043243408, + "learning_rate": 1e-06, + "loss": 0.8731, + 
"mean_token_accuracy": 0.7367253303527832, + "num_tokens": 366360721.0, + "step": 14167 + }, + { + "epoch": 1.5558972106303535, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6393067836761475, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7233826518058777, + "num_tokens": 366381748.0, + "step": 14168 + }, + { + "epoch": 1.5560070283329672, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3512375354766846, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7138084173202515, + "num_tokens": 366408008.0, + "step": 14169 + }, + { + "epoch": 1.556116846035581, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.51265025138855, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7125876545906067, + "num_tokens": 366431701.0, + "step": 14170 + }, + { + "epoch": 1.5562266637381947, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3354761600494385, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7189759016036987, + "num_tokens": 366457522.0, + "step": 14171 + }, + { + "epoch": 1.5563364814408083, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3784327507019043, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7286271452903748, + "num_tokens": 366481980.0, + "step": 14172 + }, + { + "epoch": 1.5564462991434218, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.686353921890259, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6894688010215759, + "num_tokens": 366504694.0, + "step": 14173 + }, + { + "epoch": 1.5565561168460356, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6236705780029297, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.70695561170578, + "num_tokens": 366526182.0, + "step": 14174 + }, + { + "epoch": 1.5566659345486493, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1382598876953125, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 
0.7221243381500244, + "num_tokens": 366555008.0, + "step": 14175 + }, + { + "epoch": 1.5567757522512629, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3229801654815674, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.715879499912262, + "num_tokens": 366580796.0, + "step": 14176 + }, + { + "epoch": 1.5568855699538766, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4187467098236084, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7045871019363403, + "num_tokens": 366606322.0, + "step": 14177 + }, + { + "epoch": 1.5569953876564901, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4626593589782715, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7139313220977783, + "num_tokens": 366629526.0, + "step": 14178 + }, + { + "epoch": 1.557105205359104, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.567721366882324, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7257959842681885, + "num_tokens": 366650121.0, + "step": 14179 + }, + { + "epoch": 1.5572150230617177, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.576988697052002, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7106911540031433, + "num_tokens": 366673256.0, + "step": 14180 + }, + { + "epoch": 1.5573248407643312, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.221431255340576, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7061175107955933, + "num_tokens": 366703456.0, + "step": 14181 + }, + { + "epoch": 1.5574346584669447, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3206593990325928, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6963323354721069, + "num_tokens": 366734352.0, + "step": 14182 + }, + { + "epoch": 1.5575444761695585, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 7.065160274505615, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7150654196739197, + 
"num_tokens": 366761205.0, + "step": 14183 + }, + { + "epoch": 1.5576542938721722, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.428176164627075, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7169266939163208, + "num_tokens": 366785165.0, + "step": 14184 + }, + { + "epoch": 1.557764111574786, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3619444370269775, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7172996997833252, + "num_tokens": 366809717.0, + "step": 14185 + }, + { + "epoch": 1.5578739292773995, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3553903102874756, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7388375997543335, + "num_tokens": 366832449.0, + "step": 14186 + }, + { + "epoch": 1.557983746980013, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5315263271331787, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.71584153175354, + "num_tokens": 366857295.0, + "step": 14187 + }, + { + "epoch": 1.5580935646826268, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4145267009735107, + "learning_rate": 1e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6829572916030884, + "num_tokens": 366885833.0, + "step": 14188 + }, + { + "epoch": 1.5582033823852406, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2259716987609863, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7035117149353027, + "num_tokens": 366912767.0, + "step": 14189 + }, + { + "epoch": 1.5583132000878541, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5622520446777344, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.7039779424667358, + "num_tokens": 366935518.0, + "step": 14190 + }, + { + "epoch": 1.5584230177904677, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.407017707824707, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7158709764480591, + "num_tokens": 366960821.0, + "step": 
14191 + }, + { + "epoch": 1.5585328354930814, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.008990526199341, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6889967918395996, + "num_tokens": 366996506.0, + "step": 14192 + }, + { + "epoch": 1.5586426531956952, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.414292573928833, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7032451629638672, + "num_tokens": 367021721.0, + "step": 14193 + }, + { + "epoch": 1.558752470898309, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.393629789352417, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7202901840209961, + "num_tokens": 367047808.0, + "step": 14194 + }, + { + "epoch": 1.5588622886009225, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.361290454864502, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6937093734741211, + "num_tokens": 367073388.0, + "step": 14195 + }, + { + "epoch": 1.558972106303536, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.286142587661743, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6966574788093567, + "num_tokens": 367101125.0, + "step": 14196 + }, + { + "epoch": 1.5590819240061498, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.603188991546631, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6964385509490967, + "num_tokens": 367126849.0, + "step": 14197 + }, + { + "epoch": 1.5591917417087635, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.532818078994751, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.703721284866333, + "num_tokens": 367150035.0, + "step": 14198 + }, + { + "epoch": 1.5593015594113773, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5216710567474365, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7313311100006104, + "num_tokens": 367171590.0, + "step": 14199 + }, + { + "epoch": 
1.5594113771139908, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.14493727684021, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.722087562084198, + "num_tokens": 367199356.0, + "step": 14200 + }, + { + "epoch": 1.5595211948166043, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2269668579101562, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6911137700080872, + "num_tokens": 367228959.0, + "step": 14201 + }, + { + "epoch": 1.559631012519218, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3559892177581787, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7106896638870239, + "num_tokens": 367256230.0, + "step": 14202 + }, + { + "epoch": 1.5597408302218319, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1579298973083496, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6988123059272766, + "num_tokens": 367287741.0, + "step": 14203 + }, + { + "epoch": 1.5598506479244454, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5634961128234863, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.726045548915863, + "num_tokens": 367310656.0, + "step": 14204 + }, + { + "epoch": 1.559960465627059, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3109242916107178, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7139960527420044, + "num_tokens": 367337462.0, + "step": 14205 + }, + { + "epoch": 1.5600702833296727, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3144185543060303, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7184801697731018, + "num_tokens": 367363685.0, + "step": 14206 + }, + { + "epoch": 1.5601801010322864, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.623621940612793, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7108758687973022, + "num_tokens": 367386120.0, + "step": 14207 + }, + { + "epoch": 1.5602899187349002, + 
"ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.154585599899292, + "learning_rate": 1e-06, + "loss": 1.0852, + "mean_token_accuracy": 0.6805607080459595, + "num_tokens": 367417056.0, + "step": 14208 + }, + { + "epoch": 1.5603997364375137, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.319779634475708, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.714841902256012, + "num_tokens": 367442342.0, + "step": 14209 + }, + { + "epoch": 1.5605095541401273, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.053922176361084, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7143571376800537, + "num_tokens": 367472690.0, + "step": 14210 + }, + { + "epoch": 1.560619371842741, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.434098482131958, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.7020184993743896, + "num_tokens": 367498412.0, + "step": 14211 + }, + { + "epoch": 1.5607291895453548, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2353386878967285, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7239299416542053, + "num_tokens": 367525955.0, + "step": 14212 + }, + { + "epoch": 1.5608390072479685, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1032378673553467, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7279537916183472, + "num_tokens": 367554794.0, + "step": 14213 + }, + { + "epoch": 1.560948824950582, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.15347957611084, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7014682292938232, + "num_tokens": 367586005.0, + "step": 14214 + }, + { + "epoch": 1.5610586426531956, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.273554563522339, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7226352691650391, + "num_tokens": 367612133.0, + "step": 14215 + }, + { + "epoch": 1.5611684603558094, + "ewc_loss": 1.8835067749023438e-05, + 
"grad_norm": 2.5927555561065674, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7152644991874695, + "num_tokens": 367633180.0, + "step": 14216 + }, + { + "epoch": 1.5612782780584231, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2344303131103516, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6995680332183838, + "num_tokens": 367663191.0, + "step": 14217 + }, + { + "epoch": 1.5613880957610367, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3846435546875, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6999188661575317, + "num_tokens": 367689478.0, + "step": 14218 + }, + { + "epoch": 1.5614979134636502, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3720061779022217, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7323387265205383, + "num_tokens": 367713709.0, + "step": 14219 + }, + { + "epoch": 1.561607731166264, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.220205068588257, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6939561367034912, + "num_tokens": 367744479.0, + "step": 14220 + }, + { + "epoch": 1.5617175488688777, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6996147632598877, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7271231412887573, + "num_tokens": 367765403.0, + "step": 14221 + }, + { + "epoch": 1.5618273665714915, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3441081047058105, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7184804677963257, + "num_tokens": 367792333.0, + "step": 14222 + }, + { + "epoch": 1.561937184274105, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.390380382537842, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7167757749557495, + "num_tokens": 367817925.0, + "step": 14223 + }, + { + "epoch": 1.5620470019767185, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.251533031463623, + 
"learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7108473777770996, + "num_tokens": 367846064.0, + "step": 14224 + }, + { + "epoch": 1.5621568196793323, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5885872840881348, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7236114740371704, + "num_tokens": 367871089.0, + "step": 14225 + }, + { + "epoch": 1.562266637381946, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.809269666671753, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7183313369750977, + "num_tokens": 367891372.0, + "step": 14226 + }, + { + "epoch": 1.5623764550845596, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5026934146881104, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7213894724845886, + "num_tokens": 367914436.0, + "step": 14227 + }, + { + "epoch": 1.5624862727871733, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.839820623397827, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.740854799747467, + "num_tokens": 367931973.0, + "step": 14228 + }, + { + "epoch": 1.5625960904897869, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4631054401397705, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7156388759613037, + "num_tokens": 367957829.0, + "step": 14229 + }, + { + "epoch": 1.5627059081924006, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5591559410095215, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6900163888931274, + "num_tokens": 367981594.0, + "step": 14230 + }, + { + "epoch": 1.5628157258950144, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2556025981903076, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7112320065498352, + "num_tokens": 368012875.0, + "step": 14231 + }, + { + "epoch": 1.562925543597628, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.363356590270996, + "learning_rate": 1e-06, + "loss": 
0.9589, + "mean_token_accuracy": 0.7162313461303711, + "num_tokens": 368038115.0, + "step": 14232 + }, + { + "epoch": 1.5630353613002415, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.371338129043579, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6894105672836304, + "num_tokens": 368064082.0, + "step": 14233 + }, + { + "epoch": 1.5631451790028552, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.6228535175323486, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7148650884628296, + "num_tokens": 368085281.0, + "step": 14234 + }, + { + "epoch": 1.563254996705469, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5724689960479736, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7284855246543884, + "num_tokens": 368108808.0, + "step": 14235 + }, + { + "epoch": 1.5633648144080827, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.692225694656372, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7472845911979675, + "num_tokens": 368127261.0, + "step": 14236 + }, + { + "epoch": 1.5634746321106963, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.293686628341675, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7372140288352966, + "num_tokens": 368152865.0, + "step": 14237 + }, + { + "epoch": 1.5635844498133098, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2019059658050537, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6974055767059326, + "num_tokens": 368183795.0, + "step": 14238 + }, + { + "epoch": 1.5636942675159236, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.32725191116333, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6995890736579895, + "num_tokens": 368209576.0, + "step": 14239 + }, + { + "epoch": 1.5638040852185373, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1637351512908936, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 
0.7170052528381348, + "num_tokens": 368240376.0, + "step": 14240 + }, + { + "epoch": 1.5639139029211508, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1949758529663086, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.714724600315094, + "num_tokens": 368269971.0, + "step": 14241 + }, + { + "epoch": 1.5640237206237646, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.914924144744873, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7229045033454895, + "num_tokens": 368288264.0, + "step": 14242 + }, + { + "epoch": 1.5641335383263781, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3814165592193604, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7213096618652344, + "num_tokens": 368313174.0, + "step": 14243 + }, + { + "epoch": 1.564243356028992, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3973963260650635, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7141707539558411, + "num_tokens": 368338117.0, + "step": 14244 + }, + { + "epoch": 1.5643531737316057, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3494834899902344, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7219064235687256, + "num_tokens": 368363618.0, + "step": 14245 + }, + { + "epoch": 1.5644629914342192, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6955316066741943, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6908353567123413, + "num_tokens": 368384049.0, + "step": 14246 + }, + { + "epoch": 1.5645728091368327, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.748911142349243, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7328787446022034, + "num_tokens": 368404667.0, + "step": 14247 + }, + { + "epoch": 1.5646826268394465, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4129581451416016, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7141127586364746, + "num_tokens": 
368429066.0, + "step": 14248 + }, + { + "epoch": 1.5647924445420602, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4195919036865234, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7004481554031372, + "num_tokens": 368455380.0, + "step": 14249 + }, + { + "epoch": 1.564902262244674, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6990301609039307, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7210371494293213, + "num_tokens": 368474136.0, + "step": 14250 + }, + { + "epoch": 1.5650120799472875, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0873472690582275, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6992298364639282, + "num_tokens": 368505615.0, + "step": 14251 + }, + { + "epoch": 1.565121897649901, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6219873428344727, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6958650350570679, + "num_tokens": 368530575.0, + "step": 14252 + }, + { + "epoch": 1.5652317153525148, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7261719703674316, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7190126776695251, + "num_tokens": 368550127.0, + "step": 14253 + }, + { + "epoch": 1.5653415330551286, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6881296634674072, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7259779572486877, + "num_tokens": 368570110.0, + "step": 14254 + }, + { + "epoch": 1.565451350757742, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.375763177871704, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7487152814865112, + "num_tokens": 368593773.0, + "step": 14255 + }, + { + "epoch": 1.5655611684603556, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.351470947265625, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7332473397254944, + "num_tokens": 368620602.0, + "step": 14256 + }, + 
{ + "epoch": 1.5656709861629694, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.493042230606079, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7208914160728455, + "num_tokens": 368645461.0, + "step": 14257 + }, + { + "epoch": 1.5657808038655832, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3531620502471924, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.697655200958252, + "num_tokens": 368671850.0, + "step": 14258 + }, + { + "epoch": 1.565890621568197, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3639800548553467, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.717739462852478, + "num_tokens": 368697958.0, + "step": 14259 + }, + { + "epoch": 1.5660004392708105, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.349041700363159, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7039562463760376, + "num_tokens": 368725058.0, + "step": 14260 + }, + { + "epoch": 1.566110256973424, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.8711729049682617, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6952353119850159, + "num_tokens": 368750045.0, + "step": 14261 + }, + { + "epoch": 1.5662200746760377, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3148605823516846, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7004299163818359, + "num_tokens": 368773970.0, + "step": 14262 + }, + { + "epoch": 1.5663298923786515, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.463123321533203, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7218310832977295, + "num_tokens": 368797353.0, + "step": 14263 + }, + { + "epoch": 1.5664397100812653, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2934372425079346, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7111622095108032, + "num_tokens": 368822956.0, + "step": 14264 + }, + { + "epoch": 1.5665495277838788, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.779404878616333, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6944200992584229, + "num_tokens": 368844391.0, + "step": 14265 + }, + { + "epoch": 1.5666593454864923, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1944100856781006, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7192196846008301, + "num_tokens": 368871579.0, + "step": 14266 + }, + { + "epoch": 1.566769163189106, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4806578159332275, + "learning_rate": 1e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7544074058532715, + "num_tokens": 368892953.0, + "step": 14267 + }, + { + "epoch": 1.5668789808917198, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4660825729370117, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7265053987503052, + "num_tokens": 368915633.0, + "step": 14268 + }, + { + "epoch": 1.5669887985943334, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5808486938476562, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7419650554656982, + "num_tokens": 368937851.0, + "step": 14269 + }, + { + "epoch": 1.567098616296947, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5406455993652344, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7187641859054565, + "num_tokens": 368959935.0, + "step": 14270 + }, + { + "epoch": 1.5672084339995607, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5327913761138916, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6872004270553589, + "num_tokens": 368983887.0, + "step": 14271 + }, + { + "epoch": 1.5673182517021744, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5377955436706543, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6991605758666992, + "num_tokens": 369007119.0, + "step": 14272 + }, + { + "epoch": 1.5674280694047882, + "ewc_loss": 1.895427703857422e-05, 
+ "grad_norm": 2.205392360687256, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7081596851348877, + "num_tokens": 369038244.0, + "step": 14273 + }, + { + "epoch": 1.5675378871074017, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3473117351531982, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7111178636550903, + "num_tokens": 369064040.0, + "step": 14274 + }, + { + "epoch": 1.5676477048100153, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.9802626371383667, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.717837393283844, + "num_tokens": 369099983.0, + "step": 14275 + }, + { + "epoch": 1.567757522512629, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.479825735092163, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7102718949317932, + "num_tokens": 369124905.0, + "step": 14276 + }, + { + "epoch": 1.5678673402152428, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.327375888824463, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6972931623458862, + "num_tokens": 369152553.0, + "step": 14277 + }, + { + "epoch": 1.5679771579178565, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6007914543151855, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.728164553642273, + "num_tokens": 369173755.0, + "step": 14278 + }, + { + "epoch": 1.56808697562047, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0473647117614746, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7213469743728638, + "num_tokens": 369204835.0, + "step": 14279 + }, + { + "epoch": 1.5681967933230836, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4508235454559326, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6915187835693359, + "num_tokens": 369229517.0, + "step": 14280 + }, + { + "epoch": 1.5683066110256974, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4734106063842773, + 
"learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.7000044584274292, + "num_tokens": 369251994.0, + "step": 14281 + }, + { + "epoch": 1.568416428728311, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.277303457260132, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6993868350982666, + "num_tokens": 369278656.0, + "step": 14282 + }, + { + "epoch": 1.5685262464309246, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5368566513061523, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.7030071020126343, + "num_tokens": 369307060.0, + "step": 14283 + }, + { + "epoch": 1.5686360641335382, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2963035106658936, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6995752453804016, + "num_tokens": 369334937.0, + "step": 14284 + }, + { + "epoch": 1.568745881836152, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5649404525756836, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.71144038438797, + "num_tokens": 369359214.0, + "step": 14285 + }, + { + "epoch": 1.5688556995387657, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3112707138061523, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7219526171684265, + "num_tokens": 369387553.0, + "step": 14286 + }, + { + "epoch": 1.5689655172413794, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.403350830078125, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.717060923576355, + "num_tokens": 369411021.0, + "step": 14287 + }, + { + "epoch": 1.569075334943993, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.553093671798706, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7426896095275879, + "num_tokens": 369433815.0, + "step": 14288 + }, + { + "epoch": 1.5691851526466065, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5956897735595703, + "learning_rate": 1e-06, + "loss": 0.9656, 
+ "mean_token_accuracy": 0.7142133712768555, + "num_tokens": 369456333.0, + "step": 14289 + }, + { + "epoch": 1.5692949703492203, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.447404146194458, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7118308544158936, + "num_tokens": 369479927.0, + "step": 14290 + }, + { + "epoch": 1.569404788051834, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2726175785064697, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.7025511264801025, + "num_tokens": 369509725.0, + "step": 14291 + }, + { + "epoch": 1.5695146057544476, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4310479164123535, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7167479991912842, + "num_tokens": 369533129.0, + "step": 14292 + }, + { + "epoch": 1.5696244234570613, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.105912685394287, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.6981186866760254, + "num_tokens": 369566005.0, + "step": 14293 + }, + { + "epoch": 1.5697342411596749, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3756372928619385, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.69920814037323, + "num_tokens": 369594485.0, + "step": 14294 + }, + { + "epoch": 1.5698440588622886, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.719473361968994, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7176294922828674, + "num_tokens": 369616813.0, + "step": 14295 + }, + { + "epoch": 1.5699538765649024, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.355428457260132, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.7027014493942261, + "num_tokens": 369644035.0, + "step": 14296 + }, + { + "epoch": 1.570063694267516, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.356187582015991, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 
0.7057633399963379, + "num_tokens": 369671675.0, + "step": 14297 + }, + { + "epoch": 1.5701735119701294, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.387502431869507, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7276874780654907, + "num_tokens": 369697141.0, + "step": 14298 + }, + { + "epoch": 1.5702833296727432, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.302187204360962, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7128299474716187, + "num_tokens": 369723377.0, + "step": 14299 + }, + { + "epoch": 1.570393147375357, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.580098867416382, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7085115909576416, + "num_tokens": 369746629.0, + "step": 14300 + }, + { + "epoch": 1.5705029650779707, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4116241931915283, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7170275449752808, + "num_tokens": 369772301.0, + "step": 14301 + }, + { + "epoch": 1.5706127827805842, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5719943046569824, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.710394024848938, + "num_tokens": 369794383.0, + "step": 14302 + }, + { + "epoch": 1.5707226004831978, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3380236625671387, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7150180339813232, + "num_tokens": 369820624.0, + "step": 14303 + }, + { + "epoch": 1.5708324181858115, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3945600986480713, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7025670409202576, + "num_tokens": 369847660.0, + "step": 14304 + }, + { + "epoch": 1.5709422358884253, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.699812412261963, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7168832421302795, + "num_tokens": 
369869638.0, + "step": 14305 + }, + { + "epoch": 1.5710520535910388, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.262885332107544, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7377282381057739, + "num_tokens": 369894309.0, + "step": 14306 + }, + { + "epoch": 1.5711618712936526, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3797829151153564, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7147665023803711, + "num_tokens": 369921955.0, + "step": 14307 + }, + { + "epoch": 1.5712716889962661, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3754923343658447, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.708746075630188, + "num_tokens": 369947219.0, + "step": 14308 + }, + { + "epoch": 1.5713815066988799, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3901546001434326, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7373037338256836, + "num_tokens": 369972667.0, + "step": 14309 + }, + { + "epoch": 1.5714913244014936, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4181113243103027, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7178450226783752, + "num_tokens": 369998788.0, + "step": 14310 + }, + { + "epoch": 1.5716011421041072, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.084071636199951, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7485464811325073, + "num_tokens": 370029169.0, + "step": 14311 + }, + { + "epoch": 1.5717109598067207, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.451263666152954, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.707317590713501, + "num_tokens": 370053808.0, + "step": 14312 + }, + { + "epoch": 1.5718207775093345, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.451294422149658, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7125560641288757, + "num_tokens": 370078890.0, + "step": 14313 + }, + { 
+ "epoch": 1.5719305952119482, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.54677414894104, + "learning_rate": 1e-06, + "loss": 1.1211, + "mean_token_accuracy": 0.6730948686599731, + "num_tokens": 370104070.0, + "step": 14314 + }, + { + "epoch": 1.572040412914562, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5305230617523193, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7208782434463501, + "num_tokens": 370128170.0, + "step": 14315 + }, + { + "epoch": 1.5721502306171755, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.154329538345337, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7127954959869385, + "num_tokens": 370159853.0, + "step": 14316 + }, + { + "epoch": 1.572260048319789, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2584149837493896, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7170364856719971, + "num_tokens": 370188294.0, + "step": 14317 + }, + { + "epoch": 1.5723698660224028, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4475646018981934, + "learning_rate": 1e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.6890875697135925, + "num_tokens": 370213186.0, + "step": 14318 + }, + { + "epoch": 1.5724796837250166, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2243080139160156, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7166042923927307, + "num_tokens": 370242167.0, + "step": 14319 + }, + { + "epoch": 1.57258950142763, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0214414596557617, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7137258648872375, + "num_tokens": 370275462.0, + "step": 14320 + }, + { + "epoch": 1.5726993191302436, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.347648859024048, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.712565541267395, + "num_tokens": 370301238.0, + "step": 14321 + }, + { + "epoch": 1.5728091368328574, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5810294151306152, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7179440855979919, + "num_tokens": 370322420.0, + "step": 14322 + }, + { + "epoch": 1.5729189545354711, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3731648921966553, + "learning_rate": 1e-06, + "loss": 1.08, + "mean_token_accuracy": 0.6831290125846863, + "num_tokens": 370350201.0, + "step": 14323 + }, + { + "epoch": 1.573028772238085, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3096508979797363, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6886625289916992, + "num_tokens": 370379132.0, + "step": 14324 + }, + { + "epoch": 1.5731385899406984, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5120155811309814, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6911468505859375, + "num_tokens": 370403156.0, + "step": 14325 + }, + { + "epoch": 1.573248407643312, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.728135347366333, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7185757160186768, + "num_tokens": 370425779.0, + "step": 14326 + }, + { + "epoch": 1.5733582253459257, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3531248569488525, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7130395770072937, + "num_tokens": 370456410.0, + "step": 14327 + }, + { + "epoch": 1.5734680430485395, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3274452686309814, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7282276749610901, + "num_tokens": 370482807.0, + "step": 14328 + }, + { + "epoch": 1.5735778607511532, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4363787174224854, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7159188985824585, + "num_tokens": 370504454.0, + "step": 14329 + }, + { + "epoch": 1.5736876784537668, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.3098509311676025, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6974162459373474, + "num_tokens": 370531949.0, + "step": 14330 + }, + { + "epoch": 1.5737974961563803, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.403776168823242, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7221845984458923, + "num_tokens": 370557677.0, + "step": 14331 + }, + { + "epoch": 1.573907313858994, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.540030002593994, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6904716491699219, + "num_tokens": 370584060.0, + "step": 14332 + }, + { + "epoch": 1.5740171315616078, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4186649322509766, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7191058397293091, + "num_tokens": 370608927.0, + "step": 14333 + }, + { + "epoch": 1.5741269492642214, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.372692346572876, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7139194011688232, + "num_tokens": 370634430.0, + "step": 14334 + }, + { + "epoch": 1.574236766966835, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5295305252075195, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7193351984024048, + "num_tokens": 370655568.0, + "step": 14335 + }, + { + "epoch": 1.5743465846694487, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3863914012908936, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7046958804130554, + "num_tokens": 370681602.0, + "step": 14336 + }, + { + "epoch": 1.5744564023720624, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.006927251815796, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7111040353775024, + "num_tokens": 370713455.0, + "step": 14337 + }, + { + "epoch": 1.5745662200746762, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3234353065490723, + 
"learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7088310718536377, + "num_tokens": 370739938.0, + "step": 14338 + }, + { + "epoch": 1.5746760377772897, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4675772190093994, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.707823634147644, + "num_tokens": 370763405.0, + "step": 14339 + }, + { + "epoch": 1.5747858554799032, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1666178703308105, + "learning_rate": 1e-06, + "loss": 1.0893, + "mean_token_accuracy": 0.677399754524231, + "num_tokens": 370795013.0, + "step": 14340 + }, + { + "epoch": 1.574895673182517, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.228749990463257, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7092852592468262, + "num_tokens": 370821450.0, + "step": 14341 + }, + { + "epoch": 1.5750054908851308, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4227306842803955, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7122800946235657, + "num_tokens": 370846368.0, + "step": 14342 + }, + { + "epoch": 1.5751153085877443, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3331243991851807, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7192516326904297, + "num_tokens": 370872383.0, + "step": 14343 + }, + { + "epoch": 1.575225126290358, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3610212802886963, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6967377066612244, + "num_tokens": 370901133.0, + "step": 14344 + }, + { + "epoch": 1.5753349439929716, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4476242065429688, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6911278963088989, + "num_tokens": 370926893.0, + "step": 14345 + }, + { + "epoch": 1.5754447616955853, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3456149101257324, + "learning_rate": 1e-06, + "loss": 
0.9817, + "mean_token_accuracy": 0.7175013422966003, + "num_tokens": 370953501.0, + "step": 14346 + }, + { + "epoch": 1.575554579398199, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1209888458251953, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7214357852935791, + "num_tokens": 370983747.0, + "step": 14347 + }, + { + "epoch": 1.5756643971008126, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2816879749298096, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7080271244049072, + "num_tokens": 371012900.0, + "step": 14348 + }, + { + "epoch": 1.5757742148034262, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.506742238998413, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7083962559700012, + "num_tokens": 371037220.0, + "step": 14349 + }, + { + "epoch": 1.57588403250604, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.655031681060791, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7048139572143555, + "num_tokens": 371059267.0, + "step": 14350 + }, + { + "epoch": 1.5759938502086537, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.262138605117798, + "learning_rate": 1e-06, + "loss": 1.1264, + "mean_token_accuracy": 0.6773486733436584, + "num_tokens": 371090398.0, + "step": 14351 + }, + { + "epoch": 1.5761036679112674, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.307152509689331, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6925029158592224, + "num_tokens": 371119403.0, + "step": 14352 + }, + { + "epoch": 1.576213485613881, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.077331304550171, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6992329359054565, + "num_tokens": 371154075.0, + "step": 14353 + }, + { + "epoch": 1.5763233033164945, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5288166999816895, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 
0.712275505065918, + "num_tokens": 371176912.0, + "step": 14354 + }, + { + "epoch": 1.5764331210191083, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.2775745391845703, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.7025558948516846, + "num_tokens": 371202272.0, + "step": 14355 + }, + { + "epoch": 1.576542938721722, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.36122727394104, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.7144922018051147, + "num_tokens": 371227268.0, + "step": 14356 + }, + { + "epoch": 1.5766527564243356, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.517897367477417, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7245848178863525, + "num_tokens": 371252192.0, + "step": 14357 + }, + { + "epoch": 1.5767625741269493, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.5669827461242676, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7142993211746216, + "num_tokens": 371274291.0, + "step": 14358 + }, + { + "epoch": 1.5768723918295628, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.281744956970215, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6909909248352051, + "num_tokens": 371304575.0, + "step": 14359 + }, + { + "epoch": 1.5769822095321766, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3464255332946777, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.721213161945343, + "num_tokens": 371333681.0, + "step": 14360 + }, + { + "epoch": 1.5770920272347904, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1592564582824707, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.7044597268104553, + "num_tokens": 371367450.0, + "step": 14361 + }, + { + "epoch": 1.577201844937404, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.336266279220581, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7042311429977417, + "num_tokens": 
371393930.0, + "step": 14362 + }, + { + "epoch": 1.5773116626400174, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.559753656387329, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7090627551078796, + "num_tokens": 371415346.0, + "step": 14363 + }, + { + "epoch": 1.5774214803426312, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5395588874816895, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6984456181526184, + "num_tokens": 371436946.0, + "step": 14364 + }, + { + "epoch": 1.577531298045245, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.277708053588867, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7029217481613159, + "num_tokens": 371466547.0, + "step": 14365 + }, + { + "epoch": 1.5776411157478587, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3193836212158203, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7180649042129517, + "num_tokens": 371498664.0, + "step": 14366 + }, + { + "epoch": 1.5777509334504722, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3016152381896973, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.717642068862915, + "num_tokens": 371526832.0, + "step": 14367 + }, + { + "epoch": 1.5778607511530858, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2116353511810303, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7131282091140747, + "num_tokens": 371556697.0, + "step": 14368 + }, + { + "epoch": 1.5779705688556995, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1185646057128906, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6957271099090576, + "num_tokens": 371587747.0, + "step": 14369 + }, + { + "epoch": 1.5780803865583133, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.463907241821289, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.704896867275238, + "num_tokens": 371611011.0, + "step": 14370 + }, + { 
+ "epoch": 1.5781902042609268, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.363935708999634, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7053123116493225, + "num_tokens": 371636310.0, + "step": 14371 + }, + { + "epoch": 1.5783000219635406, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2426862716674805, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7027795314788818, + "num_tokens": 371664235.0, + "step": 14372 + }, + { + "epoch": 1.5784098396661541, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4085614681243896, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7058360576629639, + "num_tokens": 371689834.0, + "step": 14373 + }, + { + "epoch": 1.5785196573687679, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.0105528831481934, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6957777738571167, + "num_tokens": 371725226.0, + "step": 14374 + }, + { + "epoch": 1.5786294750713816, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.4985320568084717, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.731322169303894, + "num_tokens": 371748342.0, + "step": 14375 + }, + { + "epoch": 1.5787392927739952, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3549916744232178, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7104970216751099, + "num_tokens": 371771183.0, + "step": 14376 + }, + { + "epoch": 1.5788491104766087, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.438199520111084, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7059751749038696, + "num_tokens": 371797106.0, + "step": 14377 + }, + { + "epoch": 1.5789589281792225, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.345576763153076, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6970233917236328, + "num_tokens": 371824608.0, + "step": 14378 + }, + { + "epoch": 1.5790687458818362, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5034985542297363, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7059340476989746, + "num_tokens": 371849493.0, + "step": 14379 + }, + { + "epoch": 1.57917856358445, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2506628036499023, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7166275382041931, + "num_tokens": 371877672.0, + "step": 14380 + }, + { + "epoch": 1.5792883812870635, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.166886329650879, + "learning_rate": 1e-06, + "loss": 1.1027, + "mean_token_accuracy": 0.6740661859512329, + "num_tokens": 371909281.0, + "step": 14381 + }, + { + "epoch": 1.579398198989677, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.514655351638794, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.702340841293335, + "num_tokens": 371932126.0, + "step": 14382 + }, + { + "epoch": 1.5795080166922908, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6331405639648438, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7328330874443054, + "num_tokens": 371952465.0, + "step": 14383 + }, + { + "epoch": 1.5796178343949046, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8302197456359863, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.730131983757019, + "num_tokens": 371971853.0, + "step": 14384 + }, + { + "epoch": 1.579727652097518, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.41544246673584, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6889185309410095, + "num_tokens": 371999889.0, + "step": 14385 + }, + { + "epoch": 1.5798374698001316, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1625750064849854, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6954692006111145, + "num_tokens": 372031413.0, + "step": 14386 + }, + { + "epoch": 1.5799472875027454, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.419133424758911, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7193505764007568, + "num_tokens": 372055163.0, + "step": 14387 + }, + { + "epoch": 1.5800571052053591, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3884878158569336, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6967030763626099, + "num_tokens": 372081104.0, + "step": 14388 + }, + { + "epoch": 1.580166922907973, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5422701835632324, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.719436764717102, + "num_tokens": 372105152.0, + "step": 14389 + }, + { + "epoch": 1.5802767406105864, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.273266553878784, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7049723863601685, + "num_tokens": 372134673.0, + "step": 14390 + }, + { + "epoch": 1.5803865583132, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.603663921356201, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7105029225349426, + "num_tokens": 372157655.0, + "step": 14391 + }, + { + "epoch": 1.5804963760158137, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2714121341705322, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7244064807891846, + "num_tokens": 372181746.0, + "step": 14392 + }, + { + "epoch": 1.5806061937184275, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.415287494659424, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6964684724807739, + "num_tokens": 372209356.0, + "step": 14393 + }, + { + "epoch": 1.5807160114210412, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4282820224761963, + "learning_rate": 1e-06, + "loss": 1.0915, + "mean_token_accuracy": 0.685308039188385, + "num_tokens": 372233574.0, + "step": 14394 + }, + { + "epoch": 1.5808258291236548, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.440504312515259, + 
"learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.696776807308197, + "num_tokens": 372261343.0, + "step": 14395 + }, + { + "epoch": 1.5809356468262683, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5400192737579346, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7083874344825745, + "num_tokens": 372284609.0, + "step": 14396 + }, + { + "epoch": 1.581045464528882, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5023117065429688, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7158045172691345, + "num_tokens": 372309407.0, + "step": 14397 + }, + { + "epoch": 1.5811552822314958, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.73113751411438, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7216935753822327, + "num_tokens": 372330438.0, + "step": 14398 + }, + { + "epoch": 1.5812650999341094, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.515347719192505, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7154513597488403, + "num_tokens": 372358343.0, + "step": 14399 + }, + { + "epoch": 1.5813749176367229, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7145962715148926, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.736746609210968, + "num_tokens": 372377730.0, + "step": 14400 + }, + { + "epoch": 1.5814847353393366, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.224590301513672, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6973444223403931, + "num_tokens": 372410573.0, + "step": 14401 + }, + { + "epoch": 1.5815945530419504, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.258511781692505, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6925392150878906, + "num_tokens": 372438940.0, + "step": 14402 + }, + { + "epoch": 1.5817043707445642, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.530569553375244, + "learning_rate": 1e-06, + "loss": 
0.9941, + "mean_token_accuracy": 0.702073335647583, + "num_tokens": 372462985.0, + "step": 14403 + }, + { + "epoch": 1.5818141884471777, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5568909645080566, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7060796022415161, + "num_tokens": 372488502.0, + "step": 14404 + }, + { + "epoch": 1.5819240061497912, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.38085675239563, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7027266025543213, + "num_tokens": 372514893.0, + "step": 14405 + }, + { + "epoch": 1.582033823852405, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.503340244293213, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.707310676574707, + "num_tokens": 372538957.0, + "step": 14406 + }, + { + "epoch": 1.5821436415550187, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.220015048980713, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6953856945037842, + "num_tokens": 372568819.0, + "step": 14407 + }, + { + "epoch": 1.5822534592576323, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2370100021362305, + "learning_rate": 1e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.6851403713226318, + "num_tokens": 372599091.0, + "step": 14408 + }, + { + "epoch": 1.582363276960246, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.22283935546875, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.73139488697052, + "num_tokens": 372627719.0, + "step": 14409 + }, + { + "epoch": 1.5824730946628596, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.559429168701172, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.695858359336853, + "num_tokens": 372650903.0, + "step": 14410 + }, + { + "epoch": 1.5825829123654733, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.424187183380127, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 
0.7064685821533203, + "num_tokens": 372678165.0, + "step": 14411 + }, + { + "epoch": 1.582692730068087, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.161005973815918, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6939716339111328, + "num_tokens": 372713150.0, + "step": 14412 + }, + { + "epoch": 1.5828025477707006, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3261353969573975, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.7038745880126953, + "num_tokens": 372739846.0, + "step": 14413 + }, + { + "epoch": 1.5829123654733142, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3932206630706787, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.7000064849853516, + "num_tokens": 372765815.0, + "step": 14414 + }, + { + "epoch": 1.583022183175928, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.200221538543701, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6907212734222412, + "num_tokens": 372797191.0, + "step": 14415 + }, + { + "epoch": 1.5831320008785417, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3292999267578125, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7189866900444031, + "num_tokens": 372824195.0, + "step": 14416 + }, + { + "epoch": 1.5832418185811554, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5957980155944824, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.704484760761261, + "num_tokens": 372847841.0, + "step": 14417 + }, + { + "epoch": 1.583351636283769, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.772127151489258, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7194424867630005, + "num_tokens": 372867002.0, + "step": 14418 + }, + { + "epoch": 1.5834614539863825, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.917738676071167, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.704877495765686, + "num_tokens": 
372892138.0, + "step": 14419 + }, + { + "epoch": 1.5835712716889963, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.333120107650757, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7079638838768005, + "num_tokens": 372917333.0, + "step": 14420 + }, + { + "epoch": 1.58368108939161, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.471742868423462, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7174198627471924, + "num_tokens": 372941779.0, + "step": 14421 + }, + { + "epoch": 1.5837909070942235, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5349998474121094, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7101653814315796, + "num_tokens": 372962844.0, + "step": 14422 + }, + { + "epoch": 1.5839007247968373, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2947609424591064, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6978727579116821, + "num_tokens": 372989413.0, + "step": 14423 + }, + { + "epoch": 1.5840105424994508, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4092326164245605, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7144104242324829, + "num_tokens": 373014865.0, + "step": 14424 + }, + { + "epoch": 1.5841203602020646, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.256838083267212, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6941924095153809, + "num_tokens": 373043360.0, + "step": 14425 + }, + { + "epoch": 1.5842301779046783, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2642953395843506, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7031965255737305, + "num_tokens": 373071402.0, + "step": 14426 + }, + { + "epoch": 1.5843399956072919, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.721608877182007, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7054731845855713, + "num_tokens": 373092339.0, + "step": 14427 + }, + { 
+ "epoch": 1.5844498133099054, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.612920045852661, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7034814953804016, + "num_tokens": 373113931.0, + "step": 14428 + }, + { + "epoch": 1.5845596310125192, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.491581916809082, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7248613238334656, + "num_tokens": 373137454.0, + "step": 14429 + }, + { + "epoch": 1.584669448715133, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2631983757019043, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6926417350769043, + "num_tokens": 373163741.0, + "step": 14430 + }, + { + "epoch": 1.5847792664177467, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3108184337615967, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7044093608856201, + "num_tokens": 373191778.0, + "step": 14431 + }, + { + "epoch": 1.5848890841203602, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4437031745910645, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7006608247756958, + "num_tokens": 373215503.0, + "step": 14432 + }, + { + "epoch": 1.5849989018229738, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4916012287139893, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7101137638092041, + "num_tokens": 373239727.0, + "step": 14433 + }, + { + "epoch": 1.5851087195255875, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.329207181930542, + "learning_rate": 1e-06, + "loss": 1.0856, + "mean_token_accuracy": 0.6826656460762024, + "num_tokens": 373268032.0, + "step": 14434 + }, + { + "epoch": 1.5852185372282013, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.279177188873291, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7047787308692932, + "num_tokens": 373295587.0, + "step": 14435 + }, + { + "epoch": 1.5853283549308148, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6597321033477783, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7049091458320618, + "num_tokens": 373316872.0, + "step": 14436 + }, + { + "epoch": 1.5854381726334283, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5539498329162598, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7286706566810608, + "num_tokens": 373337786.0, + "step": 14437 + }, + { + "epoch": 1.585547990336042, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5978899002075195, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7099020481109619, + "num_tokens": 373359915.0, + "step": 14438 + }, + { + "epoch": 1.5856578080386559, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.476742744445801, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7129669189453125, + "num_tokens": 373384787.0, + "step": 14439 + }, + { + "epoch": 1.5857676257412696, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2020106315612793, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.713941216468811, + "num_tokens": 373414612.0, + "step": 14440 + }, + { + "epoch": 1.5858774434438832, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4851348400115967, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.7102502584457397, + "num_tokens": 373439049.0, + "step": 14441 + }, + { + "epoch": 1.5859872611464967, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1202232837677, + "learning_rate": 1e-06, + "loss": 1.0923, + "mean_token_accuracy": 0.6824396848678589, + "num_tokens": 373474120.0, + "step": 14442 + }, + { + "epoch": 1.5860970788491104, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.888512372970581, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7308940887451172, + "num_tokens": 373497659.0, + "step": 14443 + }, + { + "epoch": 1.5862068965517242, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.3464910984039307, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7305653095245361, + "num_tokens": 373522404.0, + "step": 14444 + }, + { + "epoch": 1.586316714254338, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2597835063934326, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7287426590919495, + "num_tokens": 373549575.0, + "step": 14445 + }, + { + "epoch": 1.5864265319569515, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4916765689849854, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7314803004264832, + "num_tokens": 373572238.0, + "step": 14446 + }, + { + "epoch": 1.586536349659565, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.432371139526367, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7103317975997925, + "num_tokens": 373597048.0, + "step": 14447 + }, + { + "epoch": 1.5866461673621788, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4750967025756836, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7108827233314514, + "num_tokens": 373620945.0, + "step": 14448 + }, + { + "epoch": 1.5867559850647925, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5786900520324707, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7043942213058472, + "num_tokens": 373644170.0, + "step": 14449 + }, + { + "epoch": 1.586865802767406, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6849205493927, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7042292952537537, + "num_tokens": 373665553.0, + "step": 14450 + }, + { + "epoch": 1.5869756204700196, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.401911973953247, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7016607522964478, + "num_tokens": 373693133.0, + "step": 14451 + }, + { + "epoch": 1.5870854381726334, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4359054565429688, + 
"learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7101254463195801, + "num_tokens": 373716781.0, + "step": 14452 + }, + { + "epoch": 1.5871952558752471, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4545209407806396, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.711922287940979, + "num_tokens": 373742232.0, + "step": 14453 + }, + { + "epoch": 1.5873050735778609, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.352316379547119, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.70760178565979, + "num_tokens": 373768072.0, + "step": 14454 + }, + { + "epoch": 1.5874148912804744, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.369723320007324, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7148295640945435, + "num_tokens": 373794979.0, + "step": 14455 + }, + { + "epoch": 1.587524708983088, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.532566785812378, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7169857621192932, + "num_tokens": 373817546.0, + "step": 14456 + }, + { + "epoch": 1.5876345266857017, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.240823268890381, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.712971568107605, + "num_tokens": 373845958.0, + "step": 14457 + }, + { + "epoch": 1.5877443443883155, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.590843915939331, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7215161323547363, + "num_tokens": 373869453.0, + "step": 14458 + }, + { + "epoch": 1.5878541620909292, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4148268699645996, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7071604132652283, + "num_tokens": 373894136.0, + "step": 14459 + }, + { + "epoch": 1.5879639797935428, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.68027400970459, + "learning_rate": 1e-06, + "loss": 1.0214, + 
"mean_token_accuracy": 0.7048301696777344, + "num_tokens": 373914939.0, + "step": 14460 + }, + { + "epoch": 1.5880737974961563, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.351612091064453, + "learning_rate": 1e-06, + "loss": 1.0685, + "mean_token_accuracy": 0.6846387386322021, + "num_tokens": 373942048.0, + "step": 14461 + }, + { + "epoch": 1.58818361519877, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3111743927001953, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7084057331085205, + "num_tokens": 373971014.0, + "step": 14462 + }, + { + "epoch": 1.5882934329013838, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3743996620178223, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7511610984802246, + "num_tokens": 373994789.0, + "step": 14463 + }, + { + "epoch": 1.5884032506039973, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.492631673812866, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7198233604431152, + "num_tokens": 374016747.0, + "step": 14464 + }, + { + "epoch": 1.5885130683066109, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6174166202545166, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7233306169509888, + "num_tokens": 374039477.0, + "step": 14465 + }, + { + "epoch": 1.5886228860092246, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3311798572540283, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6995673179626465, + "num_tokens": 374068549.0, + "step": 14466 + }, + { + "epoch": 1.5887327037118384, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6018736362457275, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.723283588886261, + "num_tokens": 374090115.0, + "step": 14467 + }, + { + "epoch": 1.5888425214144521, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.564281463623047, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 
0.700648844242096, + "num_tokens": 374114433.0, + "step": 14468 + }, + { + "epoch": 1.5889523391170657, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2497594356536865, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7037376165390015, + "num_tokens": 374144386.0, + "step": 14469 + }, + { + "epoch": 1.5890621568196792, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5413782596588135, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7255626916885376, + "num_tokens": 374166726.0, + "step": 14470 + }, + { + "epoch": 1.589171974522293, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2171642780303955, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7050074338912964, + "num_tokens": 374197369.0, + "step": 14471 + }, + { + "epoch": 1.5892817922249067, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.475520372390747, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7037403583526611, + "num_tokens": 374220021.0, + "step": 14472 + }, + { + "epoch": 1.5893916099275203, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2163069248199463, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6957321166992188, + "num_tokens": 374252238.0, + "step": 14473 + }, + { + "epoch": 1.589501427630134, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.329296350479126, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7057397365570068, + "num_tokens": 374279155.0, + "step": 14474 + }, + { + "epoch": 1.5896112453327476, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2434957027435303, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7105302214622498, + "num_tokens": 374308634.0, + "step": 14475 + }, + { + "epoch": 1.5897210630353613, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5808298587799072, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7099940180778503, + "num_tokens": 
374331359.0, + "step": 14476 + }, + { + "epoch": 1.589830880737975, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5864994525909424, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.685784101486206, + "num_tokens": 374355995.0, + "step": 14477 + }, + { + "epoch": 1.5899406984405886, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.589735984802246, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7137421369552612, + "num_tokens": 374377534.0, + "step": 14478 + }, + { + "epoch": 1.5900505161432021, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 32.08463668823242, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7151426076889038, + "num_tokens": 374403849.0, + "step": 14479 + }, + { + "epoch": 1.590160333845816, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.372544527053833, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7274960279464722, + "num_tokens": 374429684.0, + "step": 14480 + }, + { + "epoch": 1.5902701515484297, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3468611240386963, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7186864018440247, + "num_tokens": 374455969.0, + "step": 14481 + }, + { + "epoch": 1.5903799692510434, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4086568355560303, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7118258476257324, + "num_tokens": 374481316.0, + "step": 14482 + }, + { + "epoch": 1.590489786953657, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3319785594940186, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6957601308822632, + "num_tokens": 374505819.0, + "step": 14483 + }, + { + "epoch": 1.5905996046562705, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3058388233184814, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7450027465820312, + "num_tokens": 374529943.0, + "step": 14484 + }, + { 
+ "epoch": 1.5907094223588842, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5921475887298584, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7195082902908325, + "num_tokens": 374551720.0, + "step": 14485 + }, + { + "epoch": 1.590819240061498, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.201012372970581, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.700827956199646, + "num_tokens": 374580820.0, + "step": 14486 + }, + { + "epoch": 1.5909290577641115, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.568451166152954, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.719666600227356, + "num_tokens": 374603548.0, + "step": 14487 + }, + { + "epoch": 1.5910388754667253, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3728866577148438, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.7038266658782959, + "num_tokens": 374629400.0, + "step": 14488 + }, + { + "epoch": 1.5911486931693388, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.500506639480591, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7088417410850525, + "num_tokens": 374651432.0, + "step": 14489 + }, + { + "epoch": 1.5912585108719526, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0988221168518066, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.70531165599823, + "num_tokens": 374684310.0, + "step": 14490 + }, + { + "epoch": 1.5913683285745663, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3673672676086426, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6998554468154907, + "num_tokens": 374709123.0, + "step": 14491 + }, + { + "epoch": 1.5914781462771799, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2601094245910645, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7110652327537537, + "num_tokens": 374737090.0, + "step": 14492 + }, + { + "epoch": 1.5915879639797934, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3565876483917236, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7095034122467041, + "num_tokens": 374762696.0, + "step": 14493 + }, + { + "epoch": 1.5916977816824072, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4979913234710693, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7183676958084106, + "num_tokens": 374787539.0, + "step": 14494 + }, + { + "epoch": 1.591807599385021, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.324186325073242, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6994198560714722, + "num_tokens": 374815548.0, + "step": 14495 + }, + { + "epoch": 1.5919174170876347, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2983033657073975, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6993125677108765, + "num_tokens": 374842855.0, + "step": 14496 + }, + { + "epoch": 1.5920272347902482, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.132603168487549, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7019888162612915, + "num_tokens": 374874927.0, + "step": 14497 + }, + { + "epoch": 1.5921370524928617, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.537517786026001, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7338263392448425, + "num_tokens": 374895034.0, + "step": 14498 + }, + { + "epoch": 1.5922468701954755, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4512648582458496, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7005041837692261, + "num_tokens": 374919496.0, + "step": 14499 + }, + { + "epoch": 1.5923566878980893, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.518599033355713, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7030639052391052, + "num_tokens": 374943244.0, + "step": 14500 + }, + { + "epoch": 1.5924665056007028, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.304515838623047, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.705528736114502, + "num_tokens": 374971146.0, + "step": 14501 + }, + { + "epoch": 1.5925763233033163, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3204383850097656, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7165946960449219, + "num_tokens": 374997752.0, + "step": 14502 + }, + { + "epoch": 1.59268614100593, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4459850788116455, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6944422721862793, + "num_tokens": 375025536.0, + "step": 14503 + }, + { + "epoch": 1.5927959587085438, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4482386112213135, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.71274733543396, + "num_tokens": 375049295.0, + "step": 14504 + }, + { + "epoch": 1.5929057764111576, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.317235231399536, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7244538068771362, + "num_tokens": 375073356.0, + "step": 14505 + }, + { + "epoch": 1.5930155941137711, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4257538318634033, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7247107028961182, + "num_tokens": 375097520.0, + "step": 14506 + }, + { + "epoch": 1.5931254118163847, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.271303176879883, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7066138982772827, + "num_tokens": 375124237.0, + "step": 14507 + }, + { + "epoch": 1.5932352295189984, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4610493183135986, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7397102117538452, + "num_tokens": 375148049.0, + "step": 14508 + }, + { + "epoch": 1.5933450472216122, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3235905170440674, + 
"learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7100580930709839, + "num_tokens": 375177807.0, + "step": 14509 + }, + { + "epoch": 1.593454864924226, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3742685317993164, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7288612127304077, + "num_tokens": 375201645.0, + "step": 14510 + }, + { + "epoch": 1.5935646826268395, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2193124294281006, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7025710940361023, + "num_tokens": 375229559.0, + "step": 14511 + }, + { + "epoch": 1.593674500329453, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2665653228759766, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.702709436416626, + "num_tokens": 375259333.0, + "step": 14512 + }, + { + "epoch": 1.5937843180320668, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2446703910827637, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6878197193145752, + "num_tokens": 375288550.0, + "step": 14513 + }, + { + "epoch": 1.5938941357346805, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.108569860458374, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7257104516029358, + "num_tokens": 375320715.0, + "step": 14514 + }, + { + "epoch": 1.594003953437294, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.292292594909668, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6864485740661621, + "num_tokens": 375350974.0, + "step": 14515 + }, + { + "epoch": 1.5941137711399076, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2121167182922363, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6988641619682312, + "num_tokens": 375380124.0, + "step": 14516 + }, + { + "epoch": 1.5942235888425214, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.506653070449829, + "learning_rate": 1e-06, + "loss": 
0.9266, + "mean_token_accuracy": 0.7293338775634766, + "num_tokens": 375403688.0, + "step": 14517 + }, + { + "epoch": 1.5943334065451351, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.695129632949829, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7196588516235352, + "num_tokens": 375424204.0, + "step": 14518 + }, + { + "epoch": 1.5944432242477489, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2033584117889404, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6890431046485901, + "num_tokens": 375454822.0, + "step": 14519 + }, + { + "epoch": 1.5945530419503624, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4780869483947754, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7030900120735168, + "num_tokens": 375481295.0, + "step": 14520 + }, + { + "epoch": 1.594662859652976, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4103128910064697, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7184654474258423, + "num_tokens": 375505553.0, + "step": 14521 + }, + { + "epoch": 1.5947726773555897, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4292750358581543, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7119245529174805, + "num_tokens": 375529493.0, + "step": 14522 + }, + { + "epoch": 1.5948824950582035, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1301827430725098, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7095775604248047, + "num_tokens": 375560012.0, + "step": 14523 + }, + { + "epoch": 1.5949923127608172, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.414346933364868, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7130365371704102, + "num_tokens": 375584353.0, + "step": 14524 + }, + { + "epoch": 1.5951021304634307, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.245692014694214, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 
0.7171299457550049, + "num_tokens": 375610895.0, + "step": 14525 + }, + { + "epoch": 1.5952119481660443, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5702624320983887, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7195401191711426, + "num_tokens": 375632870.0, + "step": 14526 + }, + { + "epoch": 1.595321765868658, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4212124347686768, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7008283138275146, + "num_tokens": 375658316.0, + "step": 14527 + }, + { + "epoch": 1.5954315835712718, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.280482053756714, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7026519775390625, + "num_tokens": 375684916.0, + "step": 14528 + }, + { + "epoch": 1.5955414012738853, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3199546337127686, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.709281861782074, + "num_tokens": 375711341.0, + "step": 14529 + }, + { + "epoch": 1.5956512189764989, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.331111431121826, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6924784183502197, + "num_tokens": 375738416.0, + "step": 14530 + }, + { + "epoch": 1.5957610366791126, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5509300231933594, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7332479953765869, + "num_tokens": 375760713.0, + "step": 14531 + }, + { + "epoch": 1.5958708543817264, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3942160606384277, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7353384494781494, + "num_tokens": 375786840.0, + "step": 14532 + }, + { + "epoch": 1.5959806720843401, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5275216102600098, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7088743448257446, + "num_tokens": 
375811614.0, + "step": 14533 + }, + { + "epoch": 1.5960904897869537, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.388990879058838, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7089011669158936, + "num_tokens": 375838020.0, + "step": 14534 + }, + { + "epoch": 1.5962003074895672, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.398057222366333, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7107692360877991, + "num_tokens": 375861796.0, + "step": 14535 + }, + { + "epoch": 1.596310125192181, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3470730781555176, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7138325572013855, + "num_tokens": 375887156.0, + "step": 14536 + }, + { + "epoch": 1.5964199428947947, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.300692558288574, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.73091721534729, + "num_tokens": 375911343.0, + "step": 14537 + }, + { + "epoch": 1.5965297605974083, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4578018188476562, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6892386078834534, + "num_tokens": 375936167.0, + "step": 14538 + }, + { + "epoch": 1.596639578300022, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1098382472991943, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.7050421833992004, + "num_tokens": 375967462.0, + "step": 14539 + }, + { + "epoch": 1.5967493960026355, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4758803844451904, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7235926389694214, + "num_tokens": 375990868.0, + "step": 14540 + }, + { + "epoch": 1.5968592137052493, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4184253215789795, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6958444714546204, + "num_tokens": 376016493.0, + "step": 14541 + }, + { 
+ "epoch": 1.596969031407863, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.657444715499878, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7135968208312988, + "num_tokens": 376039072.0, + "step": 14542 + }, + { + "epoch": 1.5970788491104766, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2807977199554443, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.715869665145874, + "num_tokens": 376067183.0, + "step": 14543 + }, + { + "epoch": 1.5971886668130901, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5176455974578857, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7200604677200317, + "num_tokens": 376091007.0, + "step": 14544 + }, + { + "epoch": 1.5972984845157039, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6932053565979004, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7320274114608765, + "num_tokens": 376110858.0, + "step": 14545 + }, + { + "epoch": 1.5974083022183176, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.637228488922119, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7335753440856934, + "num_tokens": 376131065.0, + "step": 14546 + }, + { + "epoch": 1.5975181199209314, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3964085578918457, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7072224617004395, + "num_tokens": 376156691.0, + "step": 14547 + }, + { + "epoch": 1.597627937623545, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.481735944747925, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.7010520696640015, + "num_tokens": 376180701.0, + "step": 14548 + }, + { + "epoch": 1.5977377553261585, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.334577798843384, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7003707885742188, + "num_tokens": 376209886.0, + "step": 14549 + }, + { + "epoch": 1.5978475730287722, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3742120265960693, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7256673574447632, + "num_tokens": 376234719.0, + "step": 14550 + }, + { + "epoch": 1.597957390731386, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.701467990875244, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7219750881195068, + "num_tokens": 376255215.0, + "step": 14551 + }, + { + "epoch": 1.5980672084339995, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.317655324935913, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.710788369178772, + "num_tokens": 376280985.0, + "step": 14552 + }, + { + "epoch": 1.5981770261366133, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4787747859954834, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7123871445655823, + "num_tokens": 376305600.0, + "step": 14553 + }, + { + "epoch": 1.5982868438392268, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5406224727630615, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7093549370765686, + "num_tokens": 376329147.0, + "step": 14554 + }, + { + "epoch": 1.5983966615418406, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.405447244644165, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7084089517593384, + "num_tokens": 376353320.0, + "step": 14555 + }, + { + "epoch": 1.5985064792444543, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.367852210998535, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6970005035400391, + "num_tokens": 376380025.0, + "step": 14556 + }, + { + "epoch": 1.5986162969470679, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.134425163269043, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7028505802154541, + "num_tokens": 376411452.0, + "step": 14557 + }, + { + "epoch": 1.5987261146496814, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.1021366119384766, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6974796056747437, + "num_tokens": 376444923.0, + "step": 14558 + }, + { + "epoch": 1.5988359323522952, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5307531356811523, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7057689428329468, + "num_tokens": 376467832.0, + "step": 14559 + }, + { + "epoch": 1.598945750054909, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.365642786026001, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7076948881149292, + "num_tokens": 376492742.0, + "step": 14560 + }, + { + "epoch": 1.5990555677575227, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.162767171859741, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.712340772151947, + "num_tokens": 376525201.0, + "step": 14561 + }, + { + "epoch": 1.5991653854601362, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4380104541778564, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7237077951431274, + "num_tokens": 376549642.0, + "step": 14562 + }, + { + "epoch": 1.5992752031627497, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.926475763320923, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7035607099533081, + "num_tokens": 376568606.0, + "step": 14563 + }, + { + "epoch": 1.5993850208653635, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5055887699127197, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6938996315002441, + "num_tokens": 376593160.0, + "step": 14564 + }, + { + "epoch": 1.5994948385679773, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.450685977935791, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7325528860092163, + "num_tokens": 376615368.0, + "step": 14565 + }, + { + "epoch": 1.5996046562705908, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.583364725112915, + 
"learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6867952346801758, + "num_tokens": 376639264.0, + "step": 14566 + }, + { + "epoch": 1.5997144739732043, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2943992614746094, + "learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.7007055878639221, + "num_tokens": 376669428.0, + "step": 14567 + }, + { + "epoch": 1.599824291675818, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2705111503601074, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.73136305809021, + "num_tokens": 376694719.0, + "step": 14568 + }, + { + "epoch": 1.5999341093784318, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.453756809234619, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.706932544708252, + "num_tokens": 376718948.0, + "step": 14569 + }, + { + "epoch": 1.6000439270810456, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2273061275482178, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7101384401321411, + "num_tokens": 376747031.0, + "step": 14570 + }, + { + "epoch": 1.6001537447836591, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4376370906829834, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7242913246154785, + "num_tokens": 376770942.0, + "step": 14571 + }, + { + "epoch": 1.6002635624862727, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2619848251342773, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7129384279251099, + "num_tokens": 376800528.0, + "step": 14572 + }, + { + "epoch": 1.6003733801888864, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0982565879821777, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7061271667480469, + "num_tokens": 376833529.0, + "step": 14573 + }, + { + "epoch": 1.6004831978915002, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5435402393341064, + "learning_rate": 1e-06, + "loss": 
0.9517, + "mean_token_accuracy": 0.7149996757507324, + "num_tokens": 376856645.0, + "step": 14574 + }, + { + "epoch": 1.600593015594114, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.77707576751709, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7119408845901489, + "num_tokens": 376888611.0, + "step": 14575 + }, + { + "epoch": 1.6007028332967275, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4427897930145264, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7089366316795349, + "num_tokens": 376912936.0, + "step": 14576 + }, + { + "epoch": 1.600812650999341, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4132091999053955, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6974735856056213, + "num_tokens": 376937269.0, + "step": 14577 + }, + { + "epoch": 1.6009224687019548, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.510402202606201, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7170530557632446, + "num_tokens": 376960702.0, + "step": 14578 + }, + { + "epoch": 1.6010322864045685, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.189342975616455, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7181740999221802, + "num_tokens": 376988619.0, + "step": 14579 + }, + { + "epoch": 1.601142104107182, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.186837911605835, + "learning_rate": 1e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.6808308959007263, + "num_tokens": 377020953.0, + "step": 14580 + }, + { + "epoch": 1.6012519218097956, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.527337074279785, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7171694040298462, + "num_tokens": 377044028.0, + "step": 14581 + }, + { + "epoch": 1.6013617395124093, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2578928470611572, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 
0.7051624059677124, + "num_tokens": 377076356.0, + "step": 14582 + }, + { + "epoch": 1.601471557215023, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.355581045150757, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7167329788208008, + "num_tokens": 377100286.0, + "step": 14583 + }, + { + "epoch": 1.6015813749176369, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.17443585395813, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7202000617980957, + "num_tokens": 377129006.0, + "step": 14584 + }, + { + "epoch": 1.6016911926202504, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7893290519714355, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7298986315727234, + "num_tokens": 377147317.0, + "step": 14585 + }, + { + "epoch": 1.601801010322864, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2880566120147705, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7024667263031006, + "num_tokens": 377174880.0, + "step": 14586 + }, + { + "epoch": 1.6019108280254777, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.591583013534546, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7078771591186523, + "num_tokens": 377197324.0, + "step": 14587 + }, + { + "epoch": 1.6020206457280914, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6849899291992188, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7034639716148376, + "num_tokens": 377218568.0, + "step": 14588 + }, + { + "epoch": 1.602130463430705, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.461129665374756, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.712782621383667, + "num_tokens": 377242710.0, + "step": 14589 + }, + { + "epoch": 1.6022402811333187, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0193560123443604, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.707838773727417, + "num_tokens": 
377282627.0, + "step": 14590 + }, + { + "epoch": 1.6023500988359323, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.561847686767578, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7236853241920471, + "num_tokens": 377304607.0, + "step": 14591 + }, + { + "epoch": 1.602459916538546, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.704895257949829, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7292708158493042, + "num_tokens": 377324519.0, + "step": 14592 + }, + { + "epoch": 1.6025697342411598, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.084571599960327, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6810987591743469, + "num_tokens": 377360265.0, + "step": 14593 + }, + { + "epoch": 1.6026795519437733, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.0439860820770264, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7104161977767944, + "num_tokens": 377377975.0, + "step": 14594 + }, + { + "epoch": 1.6027893696463869, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.270989179611206, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.7139003276824951, + "num_tokens": 377407738.0, + "step": 14595 + }, + { + "epoch": 1.6028991873490006, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5246033668518066, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.711966335773468, + "num_tokens": 377429190.0, + "step": 14596 + }, + { + "epoch": 1.6030090050516144, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5347819328308105, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7331929206848145, + "num_tokens": 377451639.0, + "step": 14597 + }, + { + "epoch": 1.6031188227542281, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.559978723526001, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7010314464569092, + "num_tokens": 377477193.0, + "step": 14598 + }, + { 
+ "epoch": 1.6032286404568417, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.550814628601074, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7158639430999756, + "num_tokens": 377503375.0, + "step": 14599 + }, + { + "epoch": 1.6033384581594552, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7222955226898193, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.721125066280365, + "num_tokens": 377523637.0, + "step": 14600 + }, + { + "epoch": 1.603448275862069, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4140233993530273, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6902897357940674, + "num_tokens": 377553443.0, + "step": 14601 + }, + { + "epoch": 1.6035580935646827, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.22151780128479, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7464638948440552, + "num_tokens": 377582274.0, + "step": 14602 + }, + { + "epoch": 1.6036679112672962, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7306981086730957, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.700250506401062, + "num_tokens": 377604082.0, + "step": 14603 + }, + { + "epoch": 1.60377772896991, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.322694778442383, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7313483953475952, + "num_tokens": 377629865.0, + "step": 14604 + }, + { + "epoch": 1.6038875466725235, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.757446765899658, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7291153073310852, + "num_tokens": 377651745.0, + "step": 14605 + }, + { + "epoch": 1.6039973643751373, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.322941541671753, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6948499083518982, + "num_tokens": 377683487.0, + "step": 14606 + }, + { + "epoch": 1.604107182077751, + "ewc_loss": 
1.895427703857422e-05, + "grad_norm": 2.404135227203369, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.690194308757782, + "num_tokens": 377708915.0, + "step": 14607 + }, + { + "epoch": 1.6042169997803646, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3297042846679688, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7111859917640686, + "num_tokens": 377733857.0, + "step": 14608 + }, + { + "epoch": 1.6043268174829781, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7195608615875244, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7368084192276001, + "num_tokens": 377752263.0, + "step": 14609 + }, + { + "epoch": 1.6044366351855919, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.452162981033325, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7098081111907959, + "num_tokens": 377775949.0, + "step": 14610 + }, + { + "epoch": 1.6045464528882056, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7308948040008545, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7258835434913635, + "num_tokens": 377797937.0, + "step": 14611 + }, + { + "epoch": 1.6046562705908194, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3373727798461914, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7150497436523438, + "num_tokens": 377828982.0, + "step": 14612 + }, + { + "epoch": 1.604766088293433, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.039551019668579, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6941439509391785, + "num_tokens": 377863360.0, + "step": 14613 + }, + { + "epoch": 1.6048759059960465, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.23983097076416, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6934475898742676, + "num_tokens": 377891909.0, + "step": 14614 + }, + { + "epoch": 1.6049857236986602, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 
2.397052764892578, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7124160528182983, + "num_tokens": 377916869.0, + "step": 14615 + }, + { + "epoch": 1.605095541401274, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.516397714614868, + "learning_rate": 1e-06, + "loss": 1.092, + "mean_token_accuracy": 0.6809173822402954, + "num_tokens": 377941425.0, + "step": 14616 + }, + { + "epoch": 1.6052053591038875, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4359214305877686, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7265461683273315, + "num_tokens": 377966567.0, + "step": 14617 + }, + { + "epoch": 1.605315176806501, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.701127290725708, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6868225932121277, + "num_tokens": 377993994.0, + "step": 14618 + }, + { + "epoch": 1.6054249945091148, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.476729154586792, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7141243815422058, + "num_tokens": 378018392.0, + "step": 14619 + }, + { + "epoch": 1.6055348122117286, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.701861619949341, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7483946084976196, + "num_tokens": 378038143.0, + "step": 14620 + }, + { + "epoch": 1.6056446299143423, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4270527362823486, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.7007750868797302, + "num_tokens": 378063431.0, + "step": 14621 + }, + { + "epoch": 1.6057544476169558, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5113518238067627, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7389056086540222, + "num_tokens": 378086320.0, + "step": 14622 + }, + { + "epoch": 1.6058642653195694, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7885942459106445, + "learning_rate": 
1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7101061344146729, + "num_tokens": 378107634.0, + "step": 14623 + }, + { + "epoch": 1.6059740830221831, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.88619327545166, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7149379253387451, + "num_tokens": 378134425.0, + "step": 14624 + }, + { + "epoch": 1.606083900724797, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2362425327301025, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6899716258049011, + "num_tokens": 378163654.0, + "step": 14625 + }, + { + "epoch": 1.6061937184274107, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3977627754211426, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7108332514762878, + "num_tokens": 378190317.0, + "step": 14626 + }, + { + "epoch": 1.6063035361300242, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.250971555709839, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.6825494766235352, + "num_tokens": 378219657.0, + "step": 14627 + }, + { + "epoch": 1.6064133538326377, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6998393535614014, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7169540524482727, + "num_tokens": 378239882.0, + "step": 14628 + }, + { + "epoch": 1.6065231715352515, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.297424554824829, + "learning_rate": 1e-06, + "loss": 1.102, + "mean_token_accuracy": 0.6854555010795593, + "num_tokens": 378267995.0, + "step": 14629 + }, + { + "epoch": 1.6066329892378652, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.513244152069092, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.7023013830184937, + "num_tokens": 378291931.0, + "step": 14630 + }, + { + "epoch": 1.6067428069404788, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5749826431274414, + "learning_rate": 1e-06, + "loss": 0.9861, + 
"mean_token_accuracy": 0.7038506865501404, + "num_tokens": 378315418.0, + "step": 14631 + }, + { + "epoch": 1.6068526246430923, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5263445377349854, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.719940185546875, + "num_tokens": 378336854.0, + "step": 14632 + }, + { + "epoch": 1.606962442345706, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2219736576080322, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7088358402252197, + "num_tokens": 378365320.0, + "step": 14633 + }, + { + "epoch": 1.6070722600483198, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.747603178024292, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7198989391326904, + "num_tokens": 378387196.0, + "step": 14634 + }, + { + "epoch": 1.6071820777509336, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7427632808685303, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7275360822677612, + "num_tokens": 378406099.0, + "step": 14635 + }, + { + "epoch": 1.6072918954535471, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2222604751586914, + "learning_rate": 1e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6887081861495972, + "num_tokens": 378434516.0, + "step": 14636 + }, + { + "epoch": 1.6074017131561606, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1934618949890137, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7235938906669617, + "num_tokens": 378463211.0, + "step": 14637 + }, + { + "epoch": 1.6075115308587744, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.49617600440979, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7004285454750061, + "num_tokens": 378487206.0, + "step": 14638 + }, + { + "epoch": 1.6076213485613882, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.542834758758545, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 
0.7090393304824829, + "num_tokens": 378509810.0, + "step": 14639 + }, + { + "epoch": 1.607731166264002, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.498171091079712, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7011855840682983, + "num_tokens": 378535236.0, + "step": 14640 + }, + { + "epoch": 1.6078409839666155, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3491878509521484, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7085418105125427, + "num_tokens": 378564805.0, + "step": 14641 + }, + { + "epoch": 1.607950801669229, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.382751941680908, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7048289775848389, + "num_tokens": 378590294.0, + "step": 14642 + }, + { + "epoch": 1.6080606193718427, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3344976902008057, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6854922771453857, + "num_tokens": 378616209.0, + "step": 14643 + }, + { + "epoch": 1.6081704370744565, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 4.464756011962891, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6977068781852722, + "num_tokens": 378642574.0, + "step": 14644 + }, + { + "epoch": 1.60828025477707, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.438713788986206, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7066103219985962, + "num_tokens": 378667975.0, + "step": 14645 + }, + { + "epoch": 1.6083900724796836, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2390129566192627, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7228730916976929, + "num_tokens": 378697179.0, + "step": 14646 + }, + { + "epoch": 1.6084998901822973, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7142417430877686, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7157293558120728, + "num_tokens": 
378718268.0, + "step": 14647 + }, + { + "epoch": 1.608609707884911, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.452327251434326, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7063509821891785, + "num_tokens": 378745182.0, + "step": 14648 + }, + { + "epoch": 1.6087195255875248, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4448587894439697, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7251913547515869, + "num_tokens": 378770063.0, + "step": 14649 + }, + { + "epoch": 1.6088293432901384, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4317784309387207, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7255500555038452, + "num_tokens": 378794104.0, + "step": 14650 + }, + { + "epoch": 1.608939160992752, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.302457809448242, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7084628939628601, + "num_tokens": 378820354.0, + "step": 14651 + }, + { + "epoch": 1.6090489786953657, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.36757230758667, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7069013118743896, + "num_tokens": 378847049.0, + "step": 14652 + }, + { + "epoch": 1.6091587963979794, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.401528835296631, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7282511591911316, + "num_tokens": 378871601.0, + "step": 14653 + }, + { + "epoch": 1.609268614100593, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.40405535697937, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7244171500205994, + "num_tokens": 378895536.0, + "step": 14654 + }, + { + "epoch": 1.6093784318032067, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2520811557769775, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.708167552947998, + "num_tokens": 378923499.0, + "step": 14655 + }, + { + 
"epoch": 1.6094882495058203, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2079763412475586, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7027486562728882, + "num_tokens": 378952595.0, + "step": 14656 + }, + { + "epoch": 1.609598067208434, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.285862684249878, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.695494532585144, + "num_tokens": 378981185.0, + "step": 14657 + }, + { + "epoch": 1.6097078849110478, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4367430210113525, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.7001181840896606, + "num_tokens": 379009117.0, + "step": 14658 + }, + { + "epoch": 1.6098177026136613, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7650978565216064, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.726273238658905, + "num_tokens": 379028374.0, + "step": 14659 + }, + { + "epoch": 1.6099275203162748, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2109532356262207, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6878597140312195, + "num_tokens": 379058265.0, + "step": 14660 + }, + { + "epoch": 1.6100373380188886, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5621256828308105, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7080953121185303, + "num_tokens": 379078961.0, + "step": 14661 + }, + { + "epoch": 1.6101471557215024, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.406770706176758, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7134498357772827, + "num_tokens": 379104371.0, + "step": 14662 + }, + { + "epoch": 1.6102569734241161, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6547882556915283, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7196815013885498, + "num_tokens": 379125368.0, + "step": 14663 + }, + { + "epoch": 1.6103667911267296, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3775832653045654, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.7080353498458862, + "num_tokens": 379148661.0, + "step": 14664 + }, + { + "epoch": 1.6104766088293432, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1724891662597656, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6988437175750732, + "num_tokens": 379180087.0, + "step": 14665 + }, + { + "epoch": 1.610586426531957, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3755860328674316, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6866507530212402, + "num_tokens": 379208009.0, + "step": 14666 + }, + { + "epoch": 1.6106962442345707, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5587878227233887, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7078825235366821, + "num_tokens": 379229757.0, + "step": 14667 + }, + { + "epoch": 1.6108060619371842, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.485687017440796, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7267149686813354, + "num_tokens": 379252466.0, + "step": 14668 + }, + { + "epoch": 1.610915879639798, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2339088916778564, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7211601138114929, + "num_tokens": 379281949.0, + "step": 14669 + }, + { + "epoch": 1.6110256973424115, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1853907108306885, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6873383522033691, + "num_tokens": 379314100.0, + "step": 14670 + }, + { + "epoch": 1.6111355150450253, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.295295000076294, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7120814323425293, + "num_tokens": 379341840.0, + "step": 14671 + }, + { + "epoch": 1.611245332747639, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.340769052505493, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7210301160812378, + "num_tokens": 379368847.0, + "step": 14672 + }, + { + "epoch": 1.6113551504502526, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.597010850906372, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7268224954605103, + "num_tokens": 379391704.0, + "step": 14673 + }, + { + "epoch": 1.611464968152866, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5540363788604736, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7212661504745483, + "num_tokens": 379415996.0, + "step": 14674 + }, + { + "epoch": 1.6115747858554799, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1630172729492188, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7179766893386841, + "num_tokens": 379445398.0, + "step": 14675 + }, + { + "epoch": 1.6116846035580936, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3001749515533447, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7137803435325623, + "num_tokens": 379473108.0, + "step": 14676 + }, + { + "epoch": 1.6117944212607074, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4564154148101807, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7184632420539856, + "num_tokens": 379497755.0, + "step": 14677 + }, + { + "epoch": 1.611904238963321, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6118414402008057, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7185584902763367, + "num_tokens": 379520189.0, + "step": 14678 + }, + { + "epoch": 1.6120140566659344, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5349011421203613, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7050457000732422, + "num_tokens": 379543320.0, + "step": 14679 + }, + { + "epoch": 1.6121238743685482, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3848516941070557, + 
"learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7009716629981995, + "num_tokens": 379566938.0, + "step": 14680 + }, + { + "epoch": 1.612233692071162, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3913798332214355, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7109757661819458, + "num_tokens": 379591567.0, + "step": 14681 + }, + { + "epoch": 1.6123435097737755, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3202383518218994, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7221083641052246, + "num_tokens": 379617964.0, + "step": 14682 + }, + { + "epoch": 1.612453327476389, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3179097175598145, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7076461315155029, + "num_tokens": 379645750.0, + "step": 14683 + }, + { + "epoch": 1.6125631451790028, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4902236461639404, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7308763265609741, + "num_tokens": 379669620.0, + "step": 14684 + }, + { + "epoch": 1.6126729628816165, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.313016414642334, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.7083907127380371, + "num_tokens": 379695559.0, + "step": 14685 + }, + { + "epoch": 1.6127827805842303, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.549765110015869, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7071252465248108, + "num_tokens": 379719060.0, + "step": 14686 + }, + { + "epoch": 1.6128925982868438, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.357645034790039, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6938250064849854, + "num_tokens": 379746301.0, + "step": 14687 + }, + { + "epoch": 1.6130024159894574, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.264016628265381, + "learning_rate": 1e-06, + "loss": 
1.039, + "mean_token_accuracy": 0.7002254724502563, + "num_tokens": 379774412.0, + "step": 14688 + }, + { + "epoch": 1.6131122336920711, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.302544593811035, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6898486614227295, + "num_tokens": 379802433.0, + "step": 14689 + }, + { + "epoch": 1.6132220513946849, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.094109296798706, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.7035157680511475, + "num_tokens": 379834583.0, + "step": 14690 + }, + { + "epoch": 1.6133318690972986, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8731114864349365, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7122626304626465, + "num_tokens": 379854369.0, + "step": 14691 + }, + { + "epoch": 1.6134416867999122, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.337449550628662, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7085044980049133, + "num_tokens": 379880685.0, + "step": 14692 + }, + { + "epoch": 1.6135515045025257, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.257599115371704, + "learning_rate": 1e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.6800262928009033, + "num_tokens": 379910731.0, + "step": 14693 + }, + { + "epoch": 1.6136613222051395, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4111883640289307, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7190066576004028, + "num_tokens": 379934116.0, + "step": 14694 + }, + { + "epoch": 1.6137711399077532, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.420073986053467, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7199506163597107, + "num_tokens": 379960684.0, + "step": 14695 + }, + { + "epoch": 1.6138809576103668, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.435213088989258, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 
0.6980622410774231, + "num_tokens": 379987816.0, + "step": 14696 + }, + { + "epoch": 1.6139907753129803, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1723124980926514, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7109106183052063, + "num_tokens": 380017144.0, + "step": 14697 + }, + { + "epoch": 1.614100593015594, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.483160972595215, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7167053818702698, + "num_tokens": 380039497.0, + "step": 14698 + }, + { + "epoch": 1.6142104107182078, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.182799816131592, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7090530395507812, + "num_tokens": 380068009.0, + "step": 14699 + }, + { + "epoch": 1.6143202284208216, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2998664379119873, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6992111206054688, + "num_tokens": 380101244.0, + "step": 14700 + }, + { + "epoch": 1.614430046123435, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3342092037200928, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7254341840744019, + "num_tokens": 380129242.0, + "step": 14701 + }, + { + "epoch": 1.6145398638260486, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5963730812072754, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6963423490524292, + "num_tokens": 380151774.0, + "step": 14702 + }, + { + "epoch": 1.6146496815286624, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.218106746673584, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7261114716529846, + "num_tokens": 380178855.0, + "step": 14703 + }, + { + "epoch": 1.6147594992312762, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5157294273376465, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7135234475135803, + "num_tokens": 
380200391.0, + "step": 14704 + }, + { + "epoch": 1.61486931693389, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.181312322616577, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6946554183959961, + "num_tokens": 380231561.0, + "step": 14705 + }, + { + "epoch": 1.6149791346365034, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4139041900634766, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7256060838699341, + "num_tokens": 380255188.0, + "step": 14706 + }, + { + "epoch": 1.615088952339117, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5304245948791504, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7140485048294067, + "num_tokens": 380278009.0, + "step": 14707 + }, + { + "epoch": 1.6151987700417307, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.242982864379883, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6861664056777954, + "num_tokens": 380311382.0, + "step": 14708 + }, + { + "epoch": 1.6153085877443445, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0366861820220947, + "learning_rate": 1e-06, + "loss": 1.0989, + "mean_token_accuracy": 0.6796147227287292, + "num_tokens": 380348900.0, + "step": 14709 + }, + { + "epoch": 1.615418405446958, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4702935218811035, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7290288209915161, + "num_tokens": 380373273.0, + "step": 14710 + }, + { + "epoch": 1.6155282231495716, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.120469331741333, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7177444100379944, + "num_tokens": 380404588.0, + "step": 14711 + }, + { + "epoch": 1.6156380408521853, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1740803718566895, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6943334341049194, + "num_tokens": 380435705.0, + "step": 14712 + }, + { + 
"epoch": 1.615747858554799, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.476008892059326, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7008799314498901, + "num_tokens": 380459583.0, + "step": 14713 + }, + { + "epoch": 1.6158576762574128, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.464555501937866, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7383627891540527, + "num_tokens": 380483648.0, + "step": 14714 + }, + { + "epoch": 1.6159674939600264, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.396704912185669, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6909346580505371, + "num_tokens": 380510835.0, + "step": 14715 + }, + { + "epoch": 1.61607731166264, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.574444532394409, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.700295627117157, + "num_tokens": 380534479.0, + "step": 14716 + }, + { + "epoch": 1.6161871293652537, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.415630340576172, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.7056854367256165, + "num_tokens": 380558985.0, + "step": 14717 + }, + { + "epoch": 1.6162969470678674, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.540700912475586, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7031417489051819, + "num_tokens": 380582954.0, + "step": 14718 + }, + { + "epoch": 1.616406764770481, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.26842999458313, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6995744705200195, + "num_tokens": 380609904.0, + "step": 14719 + }, + { + "epoch": 1.6165165824730947, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 4.566214084625244, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7058273553848267, + "num_tokens": 380633853.0, + "step": 14720 + }, + { + "epoch": 1.6166264001757082, + "ewc_loss": 
1.895427703857422e-05, + "grad_norm": 2.4793553352355957, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7083438634872437, + "num_tokens": 380658581.0, + "step": 14721 + }, + { + "epoch": 1.616736217878322, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.285541296005249, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7240010499954224, + "num_tokens": 380684309.0, + "step": 14722 + }, + { + "epoch": 1.6168460355809358, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3602700233459473, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7009767293930054, + "num_tokens": 380714010.0, + "step": 14723 + }, + { + "epoch": 1.6169558532835493, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.415759563446045, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7577069997787476, + "num_tokens": 380736775.0, + "step": 14724 + }, + { + "epoch": 1.6170656709861628, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5256786346435547, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7334814071655273, + "num_tokens": 380758733.0, + "step": 14725 + }, + { + "epoch": 1.6171754886887766, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.59405255317688, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.722806453704834, + "num_tokens": 380781061.0, + "step": 14726 + }, + { + "epoch": 1.6172853063913903, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7518606185913086, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7393501996994019, + "num_tokens": 380800537.0, + "step": 14727 + }, + { + "epoch": 1.617395124094004, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.685500144958496, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.70308917760849, + "num_tokens": 380822551.0, + "step": 14728 + }, + { + "epoch": 1.6175049417966176, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 
2.245318651199341, + "learning_rate": 1e-06, + "loss": 1.0792, + "mean_token_accuracy": 0.683232843875885, + "num_tokens": 380850508.0, + "step": 14729 + }, + { + "epoch": 1.6176147594992312, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.424417734146118, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7067071199417114, + "num_tokens": 380873596.0, + "step": 14730 + }, + { + "epoch": 1.617724577201845, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3535995483398438, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7068319320678711, + "num_tokens": 380900552.0, + "step": 14731 + }, + { + "epoch": 1.6178343949044587, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3056392669677734, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.715222179889679, + "num_tokens": 380927255.0, + "step": 14732 + }, + { + "epoch": 1.6179442126070722, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4838461875915527, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6956675052642822, + "num_tokens": 380952323.0, + "step": 14733 + }, + { + "epoch": 1.618054030309686, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5502543449401855, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6849526762962341, + "num_tokens": 380978041.0, + "step": 14734 + }, + { + "epoch": 1.6181638480122995, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5059335231781006, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7346129417419434, + "num_tokens": 380998558.0, + "step": 14735 + }, + { + "epoch": 1.6182736657149133, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1763088703155518, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6967138051986694, + "num_tokens": 381029442.0, + "step": 14736 + }, + { + "epoch": 1.618383483417527, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4445176124572754, + "learning_rate": 
1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7258659601211548, + "num_tokens": 381053022.0, + "step": 14737 + }, + { + "epoch": 1.6184933011201406, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5398623943328857, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6944741010665894, + "num_tokens": 381078358.0, + "step": 14738 + }, + { + "epoch": 1.618603118822754, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2847747802734375, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7110670804977417, + "num_tokens": 381105573.0, + "step": 14739 + }, + { + "epoch": 1.6187129365253679, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2351324558258057, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7048472166061401, + "num_tokens": 381136566.0, + "step": 14740 + }, + { + "epoch": 1.6188227542279816, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.488201141357422, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7058553695678711, + "num_tokens": 381159490.0, + "step": 14741 + }, + { + "epoch": 1.6189325719305954, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.270541191101074, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7297589778900146, + "num_tokens": 381184986.0, + "step": 14742 + }, + { + "epoch": 1.619042389633209, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2701640129089355, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7114802598953247, + "num_tokens": 381210746.0, + "step": 14743 + }, + { + "epoch": 1.6191522073358224, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.181529998779297, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6997334957122803, + "num_tokens": 381241557.0, + "step": 14744 + }, + { + "epoch": 1.6192620250384362, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3884575366973877, + "learning_rate": 1e-06, + "loss": 0.8671, + 
"mean_token_accuracy": 0.7362399101257324, + "num_tokens": 381266486.0, + "step": 14745 + }, + { + "epoch": 1.61937184274105, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.419024705886841, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7247465252876282, + "num_tokens": 381292874.0, + "step": 14746 + }, + { + "epoch": 1.6194816604436635, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.278423309326172, + "learning_rate": 1e-06, + "loss": 1.1158, + "mean_token_accuracy": 0.6868735551834106, + "num_tokens": 381323286.0, + "step": 14747 + }, + { + "epoch": 1.619591478146277, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.548907995223999, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7208783626556396, + "num_tokens": 381344594.0, + "step": 14748 + }, + { + "epoch": 1.6197012958488908, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3830173015594482, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7123522758483887, + "num_tokens": 381371462.0, + "step": 14749 + }, + { + "epoch": 1.6198111135515045, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.457547426223755, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.691356897354126, + "num_tokens": 381397383.0, + "step": 14750 + }, + { + "epoch": 1.6199209312541183, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1465165615081787, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7121074199676514, + "num_tokens": 381426112.0, + "step": 14751 + }, + { + "epoch": 1.6200307489567318, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.424975633621216, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.700275719165802, + "num_tokens": 381452303.0, + "step": 14752 + }, + { + "epoch": 1.6201405666593454, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5527849197387695, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6989936828613281, 
+ "num_tokens": 381476583.0, + "step": 14753 + }, + { + "epoch": 1.6202503843619591, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2043282985687256, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6994341015815735, + "num_tokens": 381509030.0, + "step": 14754 + }, + { + "epoch": 1.6203602020645729, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.739877700805664, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7416902780532837, + "num_tokens": 381529557.0, + "step": 14755 + }, + { + "epoch": 1.6204700197671866, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2583887577056885, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6920062303543091, + "num_tokens": 381558586.0, + "step": 14756 + }, + { + "epoch": 1.6205798374698002, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3052260875701904, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6863300800323486, + "num_tokens": 381587177.0, + "step": 14757 + }, + { + "epoch": 1.6206896551724137, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.342880964279175, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6989949941635132, + "num_tokens": 381617923.0, + "step": 14758 + }, + { + "epoch": 1.6207994728750275, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.370997905731201, + "learning_rate": 1e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6978869438171387, + "num_tokens": 381647988.0, + "step": 14759 + }, + { + "epoch": 1.6209092905776412, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.443315267562866, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7042402029037476, + "num_tokens": 381671221.0, + "step": 14760 + }, + { + "epoch": 1.6210191082802548, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.350851058959961, + "learning_rate": 1e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.6912053823471069, + "num_tokens": 381698408.0, + 
"step": 14761 + }, + { + "epoch": 1.6211289259828683, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.727755069732666, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7202311158180237, + "num_tokens": 381717742.0, + "step": 14762 + }, + { + "epoch": 1.621238743685482, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2525634765625, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7140544056892395, + "num_tokens": 381748107.0, + "step": 14763 + }, + { + "epoch": 1.6213485613880958, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7422821521759033, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7208143472671509, + "num_tokens": 381770856.0, + "step": 14764 + }, + { + "epoch": 1.6214583790907096, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3478360176086426, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7067041397094727, + "num_tokens": 381799450.0, + "step": 14765 + }, + { + "epoch": 1.621568196793323, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.384706735610962, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7042100429534912, + "num_tokens": 381825078.0, + "step": 14766 + }, + { + "epoch": 1.6216780144959366, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2937135696411133, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6996502876281738, + "num_tokens": 381853370.0, + "step": 14767 + }, + { + "epoch": 1.6217878321985504, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2471120357513428, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7148542404174805, + "num_tokens": 381880615.0, + "step": 14768 + }, + { + "epoch": 1.6218976499011641, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4854514598846436, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.708350658416748, + "num_tokens": 381904268.0, + "step": 14769 + }, + { + "epoch": 
1.6220074676037777, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4401919841766357, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7035872936248779, + "num_tokens": 381928517.0, + "step": 14770 + }, + { + "epoch": 1.6221172853063914, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6761136054992676, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7108728885650635, + "num_tokens": 381951430.0, + "step": 14771 + }, + { + "epoch": 1.622227103009005, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4261159896850586, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7132710814476013, + "num_tokens": 381976894.0, + "step": 14772 + }, + { + "epoch": 1.6223369207116187, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.463536262512207, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7257018089294434, + "num_tokens": 382002349.0, + "step": 14773 + }, + { + "epoch": 1.6224467384142325, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.564183235168457, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7058473229408264, + "num_tokens": 382025309.0, + "step": 14774 + }, + { + "epoch": 1.622556556116846, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.470102310180664, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7039122581481934, + "num_tokens": 382047531.0, + "step": 14775 + }, + { + "epoch": 1.6226663738194596, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6025259494781494, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.688128650188446, + "num_tokens": 382070872.0, + "step": 14776 + }, + { + "epoch": 1.6227761915220733, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2442119121551514, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7118293046951294, + "num_tokens": 382100230.0, + "step": 14777 + }, + { + "epoch": 1.622886009224687, + "ewc_loss": 
1.895427703857422e-05, + "grad_norm": 2.1545498371124268, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7406942248344421, + "num_tokens": 382129822.0, + "step": 14778 + }, + { + "epoch": 1.6229958269273008, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3852596282958984, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7281460762023926, + "num_tokens": 382155243.0, + "step": 14779 + }, + { + "epoch": 1.6231056446299144, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4408226013183594, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7286247611045837, + "num_tokens": 382179428.0, + "step": 14780 + }, + { + "epoch": 1.623215462332528, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.363967180252075, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7067698240280151, + "num_tokens": 382205016.0, + "step": 14781 + }, + { + "epoch": 1.6233252800351416, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1415622234344482, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6921491026878357, + "num_tokens": 382236986.0, + "step": 14782 + }, + { + "epoch": 1.6234350977377554, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.155770778656006, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6952121257781982, + "num_tokens": 382268483.0, + "step": 14783 + }, + { + "epoch": 1.623544915440369, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.409966230392456, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7050190567970276, + "num_tokens": 382294735.0, + "step": 14784 + }, + { + "epoch": 1.6236547331429827, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2779102325439453, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7405016422271729, + "num_tokens": 382319364.0, + "step": 14785 + }, + { + "epoch": 1.6237645508455962, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 
2.3616161346435547, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6995478868484497, + "num_tokens": 382343621.0, + "step": 14786 + }, + { + "epoch": 1.62387436854821, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4851462841033936, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6999242305755615, + "num_tokens": 382371121.0, + "step": 14787 + }, + { + "epoch": 1.6239841862508237, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4511139392852783, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7039362192153931, + "num_tokens": 382397503.0, + "step": 14788 + }, + { + "epoch": 1.6240940039534373, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3164408206939697, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6908929944038391, + "num_tokens": 382424574.0, + "step": 14789 + }, + { + "epoch": 1.6242038216560508, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2390530109405518, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7040013074874878, + "num_tokens": 382450711.0, + "step": 14790 + }, + { + "epoch": 1.6243136393586646, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.333984375, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.712529182434082, + "num_tokens": 382476821.0, + "step": 14791 + }, + { + "epoch": 1.6244234570612783, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.448852300643921, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7070839405059814, + "num_tokens": 382503988.0, + "step": 14792 + }, + { + "epoch": 1.624533274763892, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2057275772094727, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.7009835243225098, + "num_tokens": 382533558.0, + "step": 14793 + }, + { + "epoch": 1.6246430924665056, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.609104633331299, + "learning_rate": 1e-06, 
+ "loss": 1.0005, + "mean_token_accuracy": 0.7063819766044617, + "num_tokens": 382557276.0, + "step": 14794 + }, + { + "epoch": 1.6247529101691192, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.496596574783325, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6958664655685425, + "num_tokens": 382582157.0, + "step": 14795 + }, + { + "epoch": 1.624862727871733, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.187021017074585, + "learning_rate": 1e-06, + "loss": 1.1205, + "mean_token_accuracy": 0.6899566650390625, + "num_tokens": 382612789.0, + "step": 14796 + }, + { + "epoch": 1.6249725455743467, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.136409282684326, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.69425368309021, + "num_tokens": 382646381.0, + "step": 14797 + }, + { + "epoch": 1.6250823632769602, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4744560718536377, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.708516538143158, + "num_tokens": 382670294.0, + "step": 14798 + }, + { + "epoch": 1.6251921809795737, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3931186199188232, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7228596210479736, + "num_tokens": 382694925.0, + "step": 14799 + }, + { + "epoch": 1.6253019986821875, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.300936460494995, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6951494812965393, + "num_tokens": 382724352.0, + "step": 14800 + }, + { + "epoch": 1.6254118163848013, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.309335470199585, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7136049270629883, + "num_tokens": 382751825.0, + "step": 14801 + }, + { + "epoch": 1.625521634087415, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3475804328918457, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 
0.7133063077926636, + "num_tokens": 382777700.0, + "step": 14802 + }, + { + "epoch": 1.6256314517900285, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6122419834136963, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7049944400787354, + "num_tokens": 382801147.0, + "step": 14803 + }, + { + "epoch": 1.625741269492642, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.627894163131714, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6997179388999939, + "num_tokens": 382823777.0, + "step": 14804 + }, + { + "epoch": 1.6258510871952558, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.329315423965454, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7061189413070679, + "num_tokens": 382851424.0, + "step": 14805 + }, + { + "epoch": 1.6259609048978696, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2997183799743652, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7262920141220093, + "num_tokens": 382879282.0, + "step": 14806 + }, + { + "epoch": 1.6260707226004834, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4401097297668457, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6991421580314636, + "num_tokens": 382905436.0, + "step": 14807 + }, + { + "epoch": 1.6261805403030969, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.310131549835205, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7041041851043701, + "num_tokens": 382933757.0, + "step": 14808 + }, + { + "epoch": 1.6262903580057104, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4822885990142822, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7032318115234375, + "num_tokens": 382956239.0, + "step": 14809 + }, + { + "epoch": 1.6264001757083242, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2353272438049316, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7119341492652893, + "num_tokens": 
382988165.0, + "step": 14810 + }, + { + "epoch": 1.626509993410938, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.451246500015259, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7050973773002625, + "num_tokens": 383014012.0, + "step": 14811 + }, + { + "epoch": 1.6266198111135515, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5226030349731445, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6995450258255005, + "num_tokens": 383039558.0, + "step": 14812 + }, + { + "epoch": 1.626729628816165, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.204977035522461, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.699228823184967, + "num_tokens": 383068124.0, + "step": 14813 + }, + { + "epoch": 1.6268394465187788, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5033304691314697, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7191779017448425, + "num_tokens": 383091127.0, + "step": 14814 + }, + { + "epoch": 1.6269492642213925, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3152339458465576, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7128883600234985, + "num_tokens": 383117348.0, + "step": 14815 + }, + { + "epoch": 1.6270590819240063, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.254551887512207, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7184831500053406, + "num_tokens": 383144594.0, + "step": 14816 + }, + { + "epoch": 1.6271688996266198, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.414660930633545, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6996729373931885, + "num_tokens": 383173211.0, + "step": 14817 + }, + { + "epoch": 1.6272787173292333, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4784698486328125, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7015047669410706, + "num_tokens": 383196574.0, + "step": 14818 + }, + { + 
"epoch": 1.627388535031847, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.371954917907715, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7036576271057129, + "num_tokens": 383225233.0, + "step": 14819 + }, + { + "epoch": 1.6274983527344609, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.58636474609375, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7202588319778442, + "num_tokens": 383246301.0, + "step": 14820 + }, + { + "epoch": 1.6276081704370746, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.187267541885376, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7009742856025696, + "num_tokens": 383276684.0, + "step": 14821 + }, + { + "epoch": 1.6277179881396882, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.609665632247925, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7255951166152954, + "num_tokens": 383298194.0, + "step": 14822 + }, + { + "epoch": 1.6278278058423017, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5754518508911133, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7410352826118469, + "num_tokens": 383319361.0, + "step": 14823 + }, + { + "epoch": 1.6279376235449154, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1020541191101074, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7172249555587769, + "num_tokens": 383349452.0, + "step": 14824 + }, + { + "epoch": 1.6280474412475292, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.260404348373413, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6996948719024658, + "num_tokens": 383380037.0, + "step": 14825 + }, + { + "epoch": 1.6281572589501427, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2825887203216553, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7307292222976685, + "num_tokens": 383405249.0, + "step": 14826 + }, + { + "epoch": 1.6282670766527563, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3713204860687256, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6927087306976318, + "num_tokens": 383430728.0, + "step": 14827 + }, + { + "epoch": 1.62837689435537, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.175196647644043, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.7091711163520813, + "num_tokens": 383460877.0, + "step": 14828 + }, + { + "epoch": 1.6284867120579838, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.570507526397705, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7163599729537964, + "num_tokens": 383484181.0, + "step": 14829 + }, + { + "epoch": 1.6285965297605975, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1792914867401123, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7075648307800293, + "num_tokens": 383513323.0, + "step": 14830 + }, + { + "epoch": 1.628706347463211, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3014605045318604, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7340008616447449, + "num_tokens": 383538472.0, + "step": 14831 + }, + { + "epoch": 1.6288161651658246, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4437642097473145, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7225468754768372, + "num_tokens": 383563054.0, + "step": 14832 + }, + { + "epoch": 1.6289259828684384, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.536595106124878, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7066560983657837, + "num_tokens": 383586167.0, + "step": 14833 + }, + { + "epoch": 1.6290358005710521, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6082425117492676, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7283492088317871, + "num_tokens": 383609904.0, + "step": 14834 + }, + { + "epoch": 1.6291456182736657, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.6119091510772705, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7133941650390625, + "num_tokens": 383631601.0, + "step": 14835 + }, + { + "epoch": 1.6292554359762794, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.566478967666626, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7270973920822144, + "num_tokens": 383653670.0, + "step": 14836 + }, + { + "epoch": 1.629365253678893, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2253952026367188, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6958719491958618, + "num_tokens": 383683590.0, + "step": 14837 + }, + { + "epoch": 1.6294750713815067, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.11806321144104, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7243564128875732, + "num_tokens": 383716512.0, + "step": 14838 + }, + { + "epoch": 1.6295848890841205, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2375409603118896, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6891540884971619, + "num_tokens": 383748673.0, + "step": 14839 + }, + { + "epoch": 1.629694706786734, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.346081256866455, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7264734506607056, + "num_tokens": 383775985.0, + "step": 14840 + }, + { + "epoch": 1.6298045244893475, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3452043533325195, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6980106830596924, + "num_tokens": 383803403.0, + "step": 14841 + }, + { + "epoch": 1.6299143421919613, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3493220806121826, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7307085990905762, + "num_tokens": 383829954.0, + "step": 14842 + }, + { + "epoch": 1.630024159894575, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5762596130371094, + 
"learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7076202630996704, + "num_tokens": 383852254.0, + "step": 14843 + }, + { + "epoch": 1.6301339775971888, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.660628318786621, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7037016749382019, + "num_tokens": 383873300.0, + "step": 14844 + }, + { + "epoch": 1.6302437952998023, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3853096961975098, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7206642627716064, + "num_tokens": 383898671.0, + "step": 14845 + }, + { + "epoch": 1.6303536130024159, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4715843200683594, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7104699611663818, + "num_tokens": 383924445.0, + "step": 14846 + }, + { + "epoch": 1.6304634307050296, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2535204887390137, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7128403186798096, + "num_tokens": 383952449.0, + "step": 14847 + }, + { + "epoch": 1.6305732484076434, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.648474931716919, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7145212888717651, + "num_tokens": 383976174.0, + "step": 14848 + }, + { + "epoch": 1.630683066110257, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.336104154586792, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7131257653236389, + "num_tokens": 384003191.0, + "step": 14849 + }, + { + "epoch": 1.6307928838128707, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.244936227798462, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7087976336479187, + "num_tokens": 384030783.0, + "step": 14850 + }, + { + "epoch": 1.6309027015154842, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.418272018432617, + "learning_rate": 1e-06, + "loss": 
1.0661, + "mean_token_accuracy": 0.6851898431777954, + "num_tokens": 384061920.0, + "step": 14851 + }, + { + "epoch": 1.631012519218098, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.519639015197754, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7250447273254395, + "num_tokens": 384086460.0, + "step": 14852 + }, + { + "epoch": 1.6311223369207117, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.676654577255249, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7217938899993896, + "num_tokens": 384110573.0, + "step": 14853 + }, + { + "epoch": 1.6312321546233253, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.442263126373291, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7172834873199463, + "num_tokens": 384135820.0, + "step": 14854 + }, + { + "epoch": 1.6313419723259388, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.293471574783325, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.70759117603302, + "num_tokens": 384161870.0, + "step": 14855 + }, + { + "epoch": 1.6314517900285526, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5919079780578613, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.7062283754348755, + "num_tokens": 384185167.0, + "step": 14856 + }, + { + "epoch": 1.6315616077311663, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4417552947998047, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7270562052726746, + "num_tokens": 384209478.0, + "step": 14857 + }, + { + "epoch": 1.63167142543378, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.223616123199463, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7061727046966553, + "num_tokens": 384238523.0, + "step": 14858 + }, + { + "epoch": 1.6317812431363936, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2302844524383545, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 
0.7248231172561646, + "num_tokens": 384264608.0, + "step": 14859 + }, + { + "epoch": 1.6318910608390071, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.327944040298462, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7076846957206726, + "num_tokens": 384292304.0, + "step": 14860 + }, + { + "epoch": 1.632000878541621, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.409942388534546, + "learning_rate": 1e-06, + "loss": 1.0909, + "mean_token_accuracy": 0.6775909662246704, + "num_tokens": 384322313.0, + "step": 14861 + }, + { + "epoch": 1.6321106962442347, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3644375801086426, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.69870924949646, + "num_tokens": 384351098.0, + "step": 14862 + }, + { + "epoch": 1.6322205139468482, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.480008363723755, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7102524042129517, + "num_tokens": 384376359.0, + "step": 14863 + }, + { + "epoch": 1.6323303316494617, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.1942405700683594, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7098325490951538, + "num_tokens": 384404628.0, + "step": 14864 + }, + { + "epoch": 1.6324401493520755, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 2.3432655334472656, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.7013149261474609, + "num_tokens": 384431541.0, + "step": 14865 + }, + { + "epoch": 1.6325499670546892, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2613048553466797, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6920596957206726, + "num_tokens": 384460588.0, + "step": 14866 + }, + { + "epoch": 1.632659784757303, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.704108715057373, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7257608771324158, + "num_tokens": 
384481212.0, + "step": 14867 + }, + { + "epoch": 1.6327696024599165, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3267505168914795, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7379387617111206, + "num_tokens": 384507741.0, + "step": 14868 + }, + { + "epoch": 1.63287942016253, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3125815391540527, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7134265899658203, + "num_tokens": 384532661.0, + "step": 14869 + }, + { + "epoch": 1.6329892378651438, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.371626853942871, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6815353631973267, + "num_tokens": 384560894.0, + "step": 14870 + }, + { + "epoch": 1.6330990555677576, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2985758781433105, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6962675452232361, + "num_tokens": 384588914.0, + "step": 14871 + }, + { + "epoch": 1.6332088732703713, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.563056468963623, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.697425901889801, + "num_tokens": 384616629.0, + "step": 14872 + }, + { + "epoch": 1.6333186909729849, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.306150436401367, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6997562646865845, + "num_tokens": 384647781.0, + "step": 14873 + }, + { + "epoch": 1.6334285086755984, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5243468284606934, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6991548538208008, + "num_tokens": 384670700.0, + "step": 14874 + }, + { + "epoch": 1.6335383263782122, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7312986850738525, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.722806990146637, + "num_tokens": 384689528.0, + "step": 14875 + }, + { + 
"epoch": 1.633648144080826, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.120075225830078, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6931943893432617, + "num_tokens": 384723911.0, + "step": 14876 + }, + { + "epoch": 1.6337579617834395, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.422244071960449, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.7087219953536987, + "num_tokens": 384750167.0, + "step": 14877 + }, + { + "epoch": 1.633867779486053, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.432668447494507, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7530355453491211, + "num_tokens": 384775381.0, + "step": 14878 + }, + { + "epoch": 1.6339775971886668, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8036844730377197, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7161210179328918, + "num_tokens": 384795819.0, + "step": 14879 + }, + { + "epoch": 1.6340874148912805, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.476902484893799, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7070437669754028, + "num_tokens": 384818800.0, + "step": 14880 + }, + { + "epoch": 1.6341972325938943, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3276374340057373, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6992813944816589, + "num_tokens": 384844704.0, + "step": 14881 + }, + { + "epoch": 1.6343070502965078, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.351785182952881, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7101823091506958, + "num_tokens": 384871538.0, + "step": 14882 + }, + { + "epoch": 1.6344168679991213, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4045557975769043, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6947374939918518, + "num_tokens": 384899619.0, + "step": 14883 + }, + { + "epoch": 1.634526685701735, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4570906162261963, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7103781700134277, + "num_tokens": 384927967.0, + "step": 14884 + }, + { + "epoch": 1.6346365034043489, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7177464962005615, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.731508195400238, + "num_tokens": 384950049.0, + "step": 14885 + }, + { + "epoch": 1.6347463211069626, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5574095249176025, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7121785879135132, + "num_tokens": 384974721.0, + "step": 14886 + }, + { + "epoch": 1.6348561388095761, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7078728675842285, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7173969149589539, + "num_tokens": 384997463.0, + "step": 14887 + }, + { + "epoch": 1.6349659565121897, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.9741358757019043, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7237133979797363, + "num_tokens": 385015660.0, + "step": 14888 + }, + { + "epoch": 1.6350757742148034, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.481801748275757, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7225309610366821, + "num_tokens": 385040549.0, + "step": 14889 + }, + { + "epoch": 1.6351855919174172, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.850501298904419, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7450140714645386, + "num_tokens": 385061746.0, + "step": 14890 + }, + { + "epoch": 1.6352954096200307, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8026604652404785, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7060500383377075, + "num_tokens": 385082821.0, + "step": 14891 + }, + { + "epoch": 1.6354052273226443, + "ewc_loss": 1.895427703857422e-05, 
+ "grad_norm": 2.180034875869751, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6953800916671753, + "num_tokens": 385115248.0, + "step": 14892 + }, + { + "epoch": 1.635515045025258, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.414203643798828, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7131433486938477, + "num_tokens": 385139636.0, + "step": 14893 + }, + { + "epoch": 1.6356248627278718, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.487340211868286, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7062997817993164, + "num_tokens": 385164473.0, + "step": 14894 + }, + { + "epoch": 1.6357346804304855, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3231306076049805, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6884763240814209, + "num_tokens": 385192452.0, + "step": 14895 + }, + { + "epoch": 1.635844498133099, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3355064392089844, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7159938812255859, + "num_tokens": 385215938.0, + "step": 14896 + }, + { + "epoch": 1.6359543158357126, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.417466163635254, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7200096845626831, + "num_tokens": 385239699.0, + "step": 14897 + }, + { + "epoch": 1.6360641335383264, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.124955654144287, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6901664733886719, + "num_tokens": 385271485.0, + "step": 14898 + }, + { + "epoch": 1.6361739512409401, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.709174394607544, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7031083703041077, + "num_tokens": 385292962.0, + "step": 14899 + }, + { + "epoch": 1.6362837689435537, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4074394702911377, + 
"learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7125047445297241, + "num_tokens": 385318331.0, + "step": 14900 + }, + { + "epoch": 1.6363935866461674, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5460264682769775, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7055832743644714, + "num_tokens": 385342660.0, + "step": 14901 + }, + { + "epoch": 1.636503404348781, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2757458686828613, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7128478288650513, + "num_tokens": 385371735.0, + "step": 14902 + }, + { + "epoch": 1.6366132220513947, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.501814126968384, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7233607769012451, + "num_tokens": 385396048.0, + "step": 14903 + }, + { + "epoch": 1.6367230397540085, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4451980590820312, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7024620175361633, + "num_tokens": 385421690.0, + "step": 14904 + }, + { + "epoch": 1.636832857456622, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.505093812942505, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7094645500183105, + "num_tokens": 385444381.0, + "step": 14905 + }, + { + "epoch": 1.6369426751592355, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.312511444091797, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7167894840240479, + "num_tokens": 385470342.0, + "step": 14906 + }, + { + "epoch": 1.6370524928618493, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.420558452606201, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7285734415054321, + "num_tokens": 385493926.0, + "step": 14907 + }, + { + "epoch": 1.637162310564463, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4877383708953857, + "learning_rate": 1e-06, + "loss": 
0.9609, + "mean_token_accuracy": 0.7143762707710266, + "num_tokens": 385520209.0, + "step": 14908 + }, + { + "epoch": 1.6372721282670768, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4100983142852783, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7103271484375, + "num_tokens": 385546552.0, + "step": 14909 + }, + { + "epoch": 1.6373819459696903, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.381622076034546, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6981729865074158, + "num_tokens": 385573161.0, + "step": 14910 + }, + { + "epoch": 1.6374917636723039, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4261157512664795, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7509724497795105, + "num_tokens": 385597104.0, + "step": 14911 + }, + { + "epoch": 1.6376015813749176, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4475183486938477, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7267274856567383, + "num_tokens": 385619874.0, + "step": 14912 + }, + { + "epoch": 1.6377113990775314, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.483628749847412, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6907100081443787, + "num_tokens": 385646407.0, + "step": 14913 + }, + { + "epoch": 1.637821216780145, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5002708435058594, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7101302742958069, + "num_tokens": 385671319.0, + "step": 14914 + }, + { + "epoch": 1.6379310344827587, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3969385623931885, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6888195276260376, + "num_tokens": 385701247.0, + "step": 14915 + }, + { + "epoch": 1.6380408521853722, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.286803722381592, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 
0.6885011196136475, + "num_tokens": 385732322.0, + "step": 14916 + }, + { + "epoch": 1.638150669887986, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.691486120223999, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7159380912780762, + "num_tokens": 385754702.0, + "step": 14917 + }, + { + "epoch": 1.6382604875905997, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2471792697906494, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6961833238601685, + "num_tokens": 385785464.0, + "step": 14918 + }, + { + "epoch": 1.6383703052932133, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5596485137939453, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7147219777107239, + "num_tokens": 385810503.0, + "step": 14919 + }, + { + "epoch": 1.6384801229958268, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 3.034245014190674, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.722885012626648, + "num_tokens": 385829758.0, + "step": 14920 + }, + { + "epoch": 1.6385899406984406, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.896867036819458, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7166069746017456, + "num_tokens": 385849368.0, + "step": 14921 + }, + { + "epoch": 1.6386997584010543, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.249325752258301, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7057388424873352, + "num_tokens": 385876430.0, + "step": 14922 + }, + { + "epoch": 1.638809576103668, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5060393810272217, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7208732962608337, + "num_tokens": 385899573.0, + "step": 14923 + }, + { + "epoch": 1.6389193938062816, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.499103307723999, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7174396514892578, + "num_tokens": 
385922626.0, + "step": 14924 + }, + { + "epoch": 1.6390292115088951, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4904677867889404, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7153281569480896, + "num_tokens": 385947999.0, + "step": 14925 + }, + { + "epoch": 1.639139029211509, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8089523315429688, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7149344086647034, + "num_tokens": 385968647.0, + "step": 14926 + }, + { + "epoch": 1.6392488469141226, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.664626121520996, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7274575233459473, + "num_tokens": 385988918.0, + "step": 14927 + }, + { + "epoch": 1.6393586646167362, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.365147590637207, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7138712406158447, + "num_tokens": 386014473.0, + "step": 14928 + }, + { + "epoch": 1.6394684823193497, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.128039598464966, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7102925181388855, + "num_tokens": 386044083.0, + "step": 14929 + }, + { + "epoch": 1.6395783000219635, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6063313484191895, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7098906636238098, + "num_tokens": 386066503.0, + "step": 14930 + }, + { + "epoch": 1.6396881177245772, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.042391061782837, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6983510255813599, + "num_tokens": 386100803.0, + "step": 14931 + }, + { + "epoch": 1.639797935427191, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.196603536605835, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6998621225357056, + "num_tokens": 386133291.0, + "step": 14932 + }, + { + 
"epoch": 1.6399077531298045, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1673128604888916, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6918271780014038, + "num_tokens": 386165398.0, + "step": 14933 + }, + { + "epoch": 1.640017570832418, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1631851196289062, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.697529137134552, + "num_tokens": 386196459.0, + "step": 14934 + }, + { + "epoch": 1.6401273885350318, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5003888607025146, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.697201132774353, + "num_tokens": 386221217.0, + "step": 14935 + }, + { + "epoch": 1.6402372062376456, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4581375122070312, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7150300145149231, + "num_tokens": 386244454.0, + "step": 14936 + }, + { + "epoch": 1.6403470239402593, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4643781185150146, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7032562494277954, + "num_tokens": 386271152.0, + "step": 14937 + }, + { + "epoch": 1.6404568416428729, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 4.105122089385986, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6991985440254211, + "num_tokens": 386296627.0, + "step": 14938 + }, + { + "epoch": 1.6405666593454864, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.435579776763916, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7195402383804321, + "num_tokens": 386322958.0, + "step": 14939 + }, + { + "epoch": 1.6406764770481002, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.453110694885254, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7237443923950195, + "num_tokens": 386345027.0, + "step": 14940 + }, + { + "epoch": 1.640786294750714, + "ewc_loss": 
1.895427703857422e-05, + "grad_norm": 2.340010404586792, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7149742841720581, + "num_tokens": 386375754.0, + "step": 14941 + }, + { + "epoch": 1.6408961124533274, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3594439029693604, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6944875121116638, + "num_tokens": 386405499.0, + "step": 14942 + }, + { + "epoch": 1.641005930155941, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.188838005065918, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7193261384963989, + "num_tokens": 386435079.0, + "step": 14943 + }, + { + "epoch": 1.6411157478585547, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2356228828430176, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7038588523864746, + "num_tokens": 386461966.0, + "step": 14944 + }, + { + "epoch": 1.6412255655611685, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4203438758850098, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7166979312896729, + "num_tokens": 386487258.0, + "step": 14945 + }, + { + "epoch": 1.6413353832637823, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6074938774108887, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7012441158294678, + "num_tokens": 386511094.0, + "step": 14946 + }, + { + "epoch": 1.6414452009663958, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.344493865966797, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.7008241415023804, + "num_tokens": 386537644.0, + "step": 14947 + }, + { + "epoch": 1.6415550186690093, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5291008949279785, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.7020756006240845, + "num_tokens": 386563414.0, + "step": 14948 + }, + { + "epoch": 1.641664836371623, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 
2.443269729614258, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6963388919830322, + "num_tokens": 386588261.0, + "step": 14949 + }, + { + "epoch": 1.6417746540742368, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.248241901397705, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7163671851158142, + "num_tokens": 386618288.0, + "step": 14950 + }, + { + "epoch": 1.6418844717768504, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.49084210395813, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.7025317549705505, + "num_tokens": 386641683.0, + "step": 14951 + }, + { + "epoch": 1.6419942894794641, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2355377674102783, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7261224985122681, + "num_tokens": 386669054.0, + "step": 14952 + }, + { + "epoch": 1.6421041071820777, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1261982917785645, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.716904878616333, + "num_tokens": 386698654.0, + "step": 14953 + }, + { + "epoch": 1.6422139248846914, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3692383766174316, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7023349404335022, + "num_tokens": 386725386.0, + "step": 14954 + }, + { + "epoch": 1.6423237425873052, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.408982992172241, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7130180597305298, + "num_tokens": 386748385.0, + "step": 14955 + }, + { + "epoch": 1.6424335602899187, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.346938371658325, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7172208428382874, + "num_tokens": 386775453.0, + "step": 14956 + }, + { + "epoch": 1.6425433779925322, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.318962335586548, + "learning_rate": 
1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7215895056724548, + "num_tokens": 386802118.0, + "step": 14957 + }, + { + "epoch": 1.642653195695146, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.245181083679199, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6870995759963989, + "num_tokens": 386833173.0, + "step": 14958 + }, + { + "epoch": 1.6427630133977598, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.310931444168091, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6981569528579712, + "num_tokens": 386858346.0, + "step": 14959 + }, + { + "epoch": 1.6428728311003735, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3483264446258545, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7111269235610962, + "num_tokens": 386886388.0, + "step": 14960 + }, + { + "epoch": 1.642982648802987, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6979713439941406, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7316097021102905, + "num_tokens": 386906007.0, + "step": 14961 + }, + { + "epoch": 1.6430924665056006, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2912628650665283, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6998786926269531, + "num_tokens": 386933428.0, + "step": 14962 + }, + { + "epoch": 1.6432022842082143, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.202526092529297, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6964694261550903, + "num_tokens": 386964853.0, + "step": 14963 + }, + { + "epoch": 1.643312101910828, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2301836013793945, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6964124441146851, + "num_tokens": 386993593.0, + "step": 14964 + }, + { + "epoch": 1.6434219196134416, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.543372631072998, + "learning_rate": 1e-06, + "loss": 1.0066, + 
"mean_token_accuracy": 0.6979531049728394, + "num_tokens": 387017013.0, + "step": 14965 + }, + { + "epoch": 1.6435317373160554, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.582322597503662, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7290307283401489, + "num_tokens": 387037142.0, + "step": 14966 + }, + { + "epoch": 1.643641555018669, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.637305498123169, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7220304608345032, + "num_tokens": 387058297.0, + "step": 14967 + }, + { + "epoch": 1.6437513727212827, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.580933094024658, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7135258913040161, + "num_tokens": 387079291.0, + "step": 14968 + }, + { + "epoch": 1.6438611904238964, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.632315158843994, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7031921148300171, + "num_tokens": 387100506.0, + "step": 14969 + }, + { + "epoch": 1.64397100812651, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3461318016052246, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6994161605834961, + "num_tokens": 387128650.0, + "step": 14970 + }, + { + "epoch": 1.6440808258291235, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4747607707977295, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7146189212799072, + "num_tokens": 387151054.0, + "step": 14971 + }, + { + "epoch": 1.6441906435317373, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5253870487213135, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7168699502944946, + "num_tokens": 387173317.0, + "step": 14972 + }, + { + "epoch": 1.644300461234351, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.638084888458252, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 
0.6995121240615845, + "num_tokens": 387195398.0, + "step": 14973 + }, + { + "epoch": 1.6444102789369648, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7843210697174072, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.71928870677948, + "num_tokens": 387216806.0, + "step": 14974 + }, + { + "epoch": 1.6445200966395783, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3507943153381348, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6987323760986328, + "num_tokens": 387244637.0, + "step": 14975 + }, + { + "epoch": 1.6446299143421919, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.369267463684082, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.7082658410072327, + "num_tokens": 387271443.0, + "step": 14976 + }, + { + "epoch": 1.6447397320448056, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3023521900177, + "learning_rate": 1e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6954516172409058, + "num_tokens": 387300901.0, + "step": 14977 + }, + { + "epoch": 1.6448495497474194, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.0676982402801514, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7339392900466919, + "num_tokens": 387319865.0, + "step": 14978 + }, + { + "epoch": 1.644959367450033, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.426473379135132, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7057757377624512, + "num_tokens": 387343059.0, + "step": 14979 + }, + { + "epoch": 1.6450691851526464, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.296785593032837, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7140710949897766, + "num_tokens": 387372396.0, + "step": 14980 + }, + { + "epoch": 1.6451790028552602, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1054301261901855, + "learning_rate": 1e-06, + "loss": 1.1011, + "mean_token_accuracy": 0.6744071841239929, + "num_tokens": 
387405565.0, + "step": 14981 + }, + { + "epoch": 1.645288820557874, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6857845783233643, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7310866117477417, + "num_tokens": 387425517.0, + "step": 14982 + }, + { + "epoch": 1.6453986382604877, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.570556879043579, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7350008487701416, + "num_tokens": 387445500.0, + "step": 14983 + }, + { + "epoch": 1.6455084559631012, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7818689346313477, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7238389849662781, + "num_tokens": 387464702.0, + "step": 14984 + }, + { + "epoch": 1.6456182736657148, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4236316680908203, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7027255296707153, + "num_tokens": 387489147.0, + "step": 14985 + }, + { + "epoch": 1.6457280913683285, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.235114097595215, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6995865106582642, + "num_tokens": 387516359.0, + "step": 14986 + }, + { + "epoch": 1.6458379090709423, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5227694511413574, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.708141565322876, + "num_tokens": 387539169.0, + "step": 14987 + }, + { + "epoch": 1.645947726773556, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.657609462738037, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7336152791976929, + "num_tokens": 387561416.0, + "step": 14988 + }, + { + "epoch": 1.6460575444761696, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3637239933013916, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.686098575592041, + "num_tokens": 387588704.0, + "step": 14989 + }, + { 
+ "epoch": 1.6461673621787831, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8167412281036377, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7095059156417847, + "num_tokens": 387609671.0, + "step": 14990 + }, + { + "epoch": 1.6462771798813969, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.140554428100586, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6906099915504456, + "num_tokens": 387644406.0, + "step": 14991 + }, + { + "epoch": 1.6463869975840106, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.135096311569214, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7092362642288208, + "num_tokens": 387675385.0, + "step": 14992 + }, + { + "epoch": 1.6464968152866242, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2405707836151123, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7211477756500244, + "num_tokens": 387702653.0, + "step": 14993 + }, + { + "epoch": 1.6466066329892377, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5370352268218994, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.732144296169281, + "num_tokens": 387726744.0, + "step": 14994 + }, + { + "epoch": 1.6467164506918515, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5430877208709717, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7178547382354736, + "num_tokens": 387749051.0, + "step": 14995 + }, + { + "epoch": 1.6468262683944652, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4550187587738037, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7036160230636597, + "num_tokens": 387773399.0, + "step": 14996 + }, + { + "epoch": 1.646936086097079, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4165427684783936, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7150514721870422, + "num_tokens": 387798562.0, + "step": 14997 + }, + { + "epoch": 1.6470459037996925, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.428408145904541, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.706291139125824, + "num_tokens": 387825123.0, + "step": 14998 + }, + { + "epoch": 1.647155721502306, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2096173763275146, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.698167622089386, + "num_tokens": 387855785.0, + "step": 14999 + }, + { + "epoch": 1.6472655392049198, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.298323392868042, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7056761980056763, + "num_tokens": 387885037.0, + "step": 15000 + }, + { + "epoch": 1.6473753569075336, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8635571002960205, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7265758514404297, + "num_tokens": 387903513.0, + "step": 15001 + }, + { + "epoch": 1.6474851746101473, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1582324504852295, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6919636130332947, + "num_tokens": 387935721.0, + "step": 15002 + }, + { + "epoch": 1.6475949923127609, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.326796293258667, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.717172384262085, + "num_tokens": 387960680.0, + "step": 15003 + }, + { + "epoch": 1.6477048100153744, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.331096887588501, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6926537752151489, + "num_tokens": 387990376.0, + "step": 15004 + }, + { + "epoch": 1.6478146277179881, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4469099044799805, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7215102910995483, + "num_tokens": 388013280.0, + "step": 15005 + }, + { + "epoch": 1.647924445420602, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.2654411792755127, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6942490339279175, + "num_tokens": 388044228.0, + "step": 15006 + }, + { + "epoch": 1.6480342631232154, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.367544412612915, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7093493342399597, + "num_tokens": 388069822.0, + "step": 15007 + }, + { + "epoch": 1.648144080825829, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1483919620513916, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6964067816734314, + "num_tokens": 388102749.0, + "step": 15008 + }, + { + "epoch": 1.6482538985284427, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7337310314178467, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7259311676025391, + "num_tokens": 388122017.0, + "step": 15009 + }, + { + "epoch": 1.6483637162310565, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5040411949157715, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6994648575782776, + "num_tokens": 388146361.0, + "step": 15010 + }, + { + "epoch": 1.6484735339336702, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5310540199279785, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6879748106002808, + "num_tokens": 388170621.0, + "step": 15011 + }, + { + "epoch": 1.6485833516362838, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4059829711914062, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7156673073768616, + "num_tokens": 388198618.0, + "step": 15012 + }, + { + "epoch": 1.6486931693388973, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.45847749710083, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.7000281810760498, + "num_tokens": 388224203.0, + "step": 15013 + }, + { + "epoch": 1.648802987041511, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2875311374664307, + 
"learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6867355108261108, + "num_tokens": 388256049.0, + "step": 15014 + }, + { + "epoch": 1.6489128047441248, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4394407272338867, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7125247120857239, + "num_tokens": 388280606.0, + "step": 15015 + }, + { + "epoch": 1.6490226224467384, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.576227903366089, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6976326704025269, + "num_tokens": 388305205.0, + "step": 15016 + }, + { + "epoch": 1.6491324401493521, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.343651533126831, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.705836296081543, + "num_tokens": 388335252.0, + "step": 15017 + }, + { + "epoch": 1.6492422578519657, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.640509605407715, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7183892726898193, + "num_tokens": 388358098.0, + "step": 15018 + }, + { + "epoch": 1.6493520755545794, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4144175052642822, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7319828271865845, + "num_tokens": 388381800.0, + "step": 15019 + }, + { + "epoch": 1.6494618932571932, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.459610939025879, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.696625828742981, + "num_tokens": 388406651.0, + "step": 15020 + }, + { + "epoch": 1.6495717109598067, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4382805824279785, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7402449250221252, + "num_tokens": 388431031.0, + "step": 15021 + }, + { + "epoch": 1.6496815286624202, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.369176149368286, + "learning_rate": 1e-06, + "loss": 
1.0791, + "mean_token_accuracy": 0.6948873996734619, + "num_tokens": 388459428.0, + "step": 15022 + }, + { + "epoch": 1.649791346365034, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2417750358581543, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7192960977554321, + "num_tokens": 388488077.0, + "step": 15023 + }, + { + "epoch": 1.6499011640676478, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3167169094085693, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7127919793128967, + "num_tokens": 388516556.0, + "step": 15024 + }, + { + "epoch": 1.6500109817702615, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.562818765640259, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6983832120895386, + "num_tokens": 388542273.0, + "step": 15025 + }, + { + "epoch": 1.650120799472875, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3425683975219727, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7144192457199097, + "num_tokens": 388568517.0, + "step": 15026 + }, + { + "epoch": 1.6502306171754886, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4888880252838135, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7175179719924927, + "num_tokens": 388590410.0, + "step": 15027 + }, + { + "epoch": 1.6503404348781023, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.667841672897339, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7378451824188232, + "num_tokens": 388614421.0, + "step": 15028 + }, + { + "epoch": 1.650450252580716, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2337496280670166, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7279190421104431, + "num_tokens": 388643445.0, + "step": 15029 + }, + { + "epoch": 1.6505600702833296, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4164066314697266, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 
0.6909489631652832, + "num_tokens": 388669637.0, + "step": 15030 + }, + { + "epoch": 1.6506698879859434, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4597384929656982, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7217549085617065, + "num_tokens": 388692316.0, + "step": 15031 + }, + { + "epoch": 1.650779705688557, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.593129873275757, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7308062314987183, + "num_tokens": 388714361.0, + "step": 15032 + }, + { + "epoch": 1.6508895233911707, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3490047454833984, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.703128457069397, + "num_tokens": 388742386.0, + "step": 15033 + }, + { + "epoch": 1.6509993410937844, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5698649883270264, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7029119729995728, + "num_tokens": 388765011.0, + "step": 15034 + }, + { + "epoch": 1.651109158796398, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4598121643066406, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7011464834213257, + "num_tokens": 388790939.0, + "step": 15035 + }, + { + "epoch": 1.6512189764990115, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.452312469482422, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7121485471725464, + "num_tokens": 388816139.0, + "step": 15036 + }, + { + "epoch": 1.6513287942016253, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.606670618057251, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7113912105560303, + "num_tokens": 388841709.0, + "step": 15037 + }, + { + "epoch": 1.651438611904239, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4304051399230957, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7027779221534729, + "num_tokens": 
388868760.0, + "step": 15038 + }, + { + "epoch": 1.6515484296068528, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.492696523666382, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7179563641548157, + "num_tokens": 388889984.0, + "step": 15039 + }, + { + "epoch": 1.6516582473094663, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.373227834701538, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7146561145782471, + "num_tokens": 388915538.0, + "step": 15040 + }, + { + "epoch": 1.6517680650120798, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6719934940338135, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7308261394500732, + "num_tokens": 388935827.0, + "step": 15041 + }, + { + "epoch": 1.6518778827146936, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5934293270111084, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7062456607818604, + "num_tokens": 388959234.0, + "step": 15042 + }, + { + "epoch": 1.6519877004173074, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.405829429626465, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6992689967155457, + "num_tokens": 388985059.0, + "step": 15043 + }, + { + "epoch": 1.652097518119921, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5381710529327393, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6867986917495728, + "num_tokens": 389008956.0, + "step": 15044 + }, + { + "epoch": 1.6522073358225344, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.447678327560425, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6946101188659668, + "num_tokens": 389035428.0, + "step": 15045 + }, + { + "epoch": 1.6523171535251482, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.443553924560547, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7087655067443848, + "num_tokens": 389060170.0, + "step": 15046 + }, + { + 
"epoch": 1.652426971227762, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4289286136627197, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.695238471031189, + "num_tokens": 389087000.0, + "step": 15047 + }, + { + "epoch": 1.6525367889303757, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5015974044799805, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6951704621315002, + "num_tokens": 389110650.0, + "step": 15048 + }, + { + "epoch": 1.6526466066329892, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.702702760696411, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7175800800323486, + "num_tokens": 389131713.0, + "step": 15049 + }, + { + "epoch": 1.6527564243356028, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4504475593566895, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6894422173500061, + "num_tokens": 389155984.0, + "step": 15050 + }, + { + "epoch": 1.6528662420382165, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.776824712753296, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.711632490158081, + "num_tokens": 389175024.0, + "step": 15051 + }, + { + "epoch": 1.6529760597408303, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.514064073562622, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7152295708656311, + "num_tokens": 389198396.0, + "step": 15052 + }, + { + "epoch": 1.653085877443444, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.387240171432495, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7083988785743713, + "num_tokens": 389224011.0, + "step": 15053 + }, + { + "epoch": 1.6531956951460576, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2706868648529053, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7161030173301697, + "num_tokens": 389250356.0, + "step": 15054 + }, + { + "epoch": 1.653305512848671, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.454587459564209, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7161072492599487, + "num_tokens": 389274008.0, + "step": 15055 + }, + { + "epoch": 1.6534153305512849, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5778696537017822, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6788558959960938, + "num_tokens": 389297790.0, + "step": 15056 + }, + { + "epoch": 1.6535251482538986, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1729161739349365, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6967058181762695, + "num_tokens": 389329161.0, + "step": 15057 + }, + { + "epoch": 1.6536349659565122, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3244872093200684, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7186873555183411, + "num_tokens": 389355768.0, + "step": 15058 + }, + { + "epoch": 1.6537447836591257, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.155393600463867, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.7077105045318604, + "num_tokens": 389386013.0, + "step": 15059 + }, + { + "epoch": 1.6538546013617395, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.426071882247925, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6966014504432678, + "num_tokens": 389412209.0, + "step": 15060 + }, + { + "epoch": 1.6539644190643532, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.033048629760742, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6934508681297302, + "num_tokens": 389446399.0, + "step": 15061 + }, + { + "epoch": 1.654074236766967, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2692651748657227, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.748634934425354, + "num_tokens": 389471148.0, + "step": 15062 + }, + { + "epoch": 1.6541840544695805, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.059375524520874, + "learning_rate": 1e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6875671148300171, + "num_tokens": 389508397.0, + "step": 15063 + }, + { + "epoch": 1.654293872172194, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.464869499206543, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7016916275024414, + "num_tokens": 389535562.0, + "step": 15064 + }, + { + "epoch": 1.6544036898748078, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.264930248260498, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7260963916778564, + "num_tokens": 389563004.0, + "step": 15065 + }, + { + "epoch": 1.6545135075774215, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3668715953826904, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7485249042510986, + "num_tokens": 389586088.0, + "step": 15066 + }, + { + "epoch": 1.6546233252800353, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2810475826263428, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7266833782196045, + "num_tokens": 389612021.0, + "step": 15067 + }, + { + "epoch": 1.6547331429826488, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1199464797973633, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7151926755905151, + "num_tokens": 389642765.0, + "step": 15068 + }, + { + "epoch": 1.6548429606852624, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.473501205444336, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7288869619369507, + "num_tokens": 389665514.0, + "step": 15069 + }, + { + "epoch": 1.6549527783878761, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4097142219543457, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6981830596923828, + "num_tokens": 389692152.0, + "step": 15070 + }, + { + "epoch": 1.65506259609049, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6075057983398438, + 
"learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7125790119171143, + "num_tokens": 389717493.0, + "step": 15071 + }, + { + "epoch": 1.6551724137931034, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.506648302078247, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7006362080574036, + "num_tokens": 389741451.0, + "step": 15072 + }, + { + "epoch": 1.655282231495717, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4128646850585938, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7098054885864258, + "num_tokens": 389767007.0, + "step": 15073 + }, + { + "epoch": 1.6553920491983307, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6606292724609375, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6889404058456421, + "num_tokens": 389788434.0, + "step": 15074 + }, + { + "epoch": 1.6555018669009445, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3536057472229004, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.693340003490448, + "num_tokens": 389815287.0, + "step": 15075 + }, + { + "epoch": 1.6556116846035582, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.669529438018799, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.727205753326416, + "num_tokens": 389836107.0, + "step": 15076 + }, + { + "epoch": 1.6557215023061718, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.381849527359009, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7197030782699585, + "num_tokens": 389860566.0, + "step": 15077 + }, + { + "epoch": 1.6558313200087853, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.174849033355713, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.721514880657196, + "num_tokens": 389891930.0, + "step": 15078 + }, + { + "epoch": 1.655941137711399, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5496554374694824, + "learning_rate": 1e-06, + "loss": 
1.0319, + "mean_token_accuracy": 0.700369119644165, + "num_tokens": 389916670.0, + "step": 15079 + }, + { + "epoch": 1.6560509554140128, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1779823303222656, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6899036169052124, + "num_tokens": 389945835.0, + "step": 15080 + }, + { + "epoch": 1.6561607731166264, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3139488697052, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7470024824142456, + "num_tokens": 389972301.0, + "step": 15081 + }, + { + "epoch": 1.65627059081924, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7738800048828125, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7387281656265259, + "num_tokens": 389992002.0, + "step": 15082 + }, + { + "epoch": 1.6563804085218536, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.429271936416626, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6934790015220642, + "num_tokens": 390018505.0, + "step": 15083 + }, + { + "epoch": 1.6564902262244674, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.591702938079834, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7189165353775024, + "num_tokens": 390040048.0, + "step": 15084 + }, + { + "epoch": 1.6566000439270812, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3791849613189697, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.701056957244873, + "num_tokens": 390068920.0, + "step": 15085 + }, + { + "epoch": 1.6567098616296947, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5502521991729736, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6856445670127869, + "num_tokens": 390095080.0, + "step": 15086 + }, + { + "epoch": 1.6568196793323082, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2225918769836426, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 
0.7192625999450684, + "num_tokens": 390123902.0, + "step": 15087 + }, + { + "epoch": 1.656929497034922, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.360764980316162, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6981474161148071, + "num_tokens": 390149633.0, + "step": 15088 + }, + { + "epoch": 1.6570393147375357, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5193123817443848, + "learning_rate": 1e-06, + "loss": 1.1041, + "mean_token_accuracy": 0.6928496360778809, + "num_tokens": 390173020.0, + "step": 15089 + }, + { + "epoch": 1.6571491324401495, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4010818004608154, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7019025683403015, + "num_tokens": 390198722.0, + "step": 15090 + }, + { + "epoch": 1.657258950142763, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.436647891998291, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.718743622303009, + "num_tokens": 390226112.0, + "step": 15091 + }, + { + "epoch": 1.6573687678453766, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3518803119659424, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7246091365814209, + "num_tokens": 390250826.0, + "step": 15092 + }, + { + "epoch": 1.6574785855479903, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5923006534576416, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7178507447242737, + "num_tokens": 390273521.0, + "step": 15093 + }, + { + "epoch": 1.657588403250604, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4330198764801025, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7183912992477417, + "num_tokens": 390300333.0, + "step": 15094 + }, + { + "epoch": 1.6576982209532176, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.329071044921875, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7380271553993225, + "num_tokens": 
390327090.0, + "step": 15095 + }, + { + "epoch": 1.6578080386558314, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4659805297851562, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6963916420936584, + "num_tokens": 390352112.0, + "step": 15096 + }, + { + "epoch": 1.657917856358445, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2756550312042236, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7282981872558594, + "num_tokens": 390379128.0, + "step": 15097 + }, + { + "epoch": 1.6580276740610587, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.576859712600708, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7101272344589233, + "num_tokens": 390402702.0, + "step": 15098 + }, + { + "epoch": 1.6581374917636724, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7003533840179443, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.7008689641952515, + "num_tokens": 390425482.0, + "step": 15099 + }, + { + "epoch": 1.658247309466286, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.553488254547119, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7130353450775146, + "num_tokens": 390448680.0, + "step": 15100 + }, + { + "epoch": 1.6583571271688995, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 7.012593746185303, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.689309298992157, + "num_tokens": 390475776.0, + "step": 15101 + }, + { + "epoch": 1.6584669448715132, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5976319313049316, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7084391713142395, + "num_tokens": 390499965.0, + "step": 15102 + }, + { + "epoch": 1.658576762574127, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5291242599487305, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7194406986236572, + "num_tokens": 390525663.0, + "step": 15103 + }, + { + 
"epoch": 1.6586865802767408, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5666918754577637, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6976807117462158, + "num_tokens": 390549509.0, + "step": 15104 + }, + { + "epoch": 1.6587963979793543, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3144783973693848, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6893866062164307, + "num_tokens": 390577154.0, + "step": 15105 + }, + { + "epoch": 1.6589062156819678, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3207600116729736, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7036810517311096, + "num_tokens": 390606950.0, + "step": 15106 + }, + { + "epoch": 1.6590160333845816, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.559208869934082, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.697094202041626, + "num_tokens": 390630059.0, + "step": 15107 + }, + { + "epoch": 1.6591258510871953, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4308419227600098, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7105856537818909, + "num_tokens": 390656855.0, + "step": 15108 + }, + { + "epoch": 1.6592356687898089, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.579028606414795, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7360773682594299, + "num_tokens": 390681132.0, + "step": 15109 + }, + { + "epoch": 1.6593454864924224, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.345923662185669, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7084988355636597, + "num_tokens": 390710201.0, + "step": 15110 + }, + { + "epoch": 1.6594553041950362, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.343515157699585, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7308118939399719, + "num_tokens": 390735305.0, + "step": 15111 + }, + { + "epoch": 1.65956512189765, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.809760570526123, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7102193832397461, + "num_tokens": 390760313.0, + "step": 15112 + }, + { + "epoch": 1.6596749396002637, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4955897331237793, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7152647376060486, + "num_tokens": 390782847.0, + "step": 15113 + }, + { + "epoch": 1.6597847573028772, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.442349910736084, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.7063696980476379, + "num_tokens": 390808668.0, + "step": 15114 + }, + { + "epoch": 1.6598945750054908, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.388942241668701, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.700360894203186, + "num_tokens": 390835133.0, + "step": 15115 + }, + { + "epoch": 1.6600043927081045, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3434479236602783, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7204277515411377, + "num_tokens": 390860844.0, + "step": 15116 + }, + { + "epoch": 1.6601142104107183, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8072688579559326, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7098974585533142, + "num_tokens": 390881447.0, + "step": 15117 + }, + { + "epoch": 1.660224028113332, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.204604387283325, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.7080463171005249, + "num_tokens": 390914311.0, + "step": 15118 + }, + { + "epoch": 1.6603338458159456, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6255252361297607, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7335168123245239, + "num_tokens": 390936130.0, + "step": 15119 + }, + { + "epoch": 1.660443663518559, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.3099114894866943, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7246390581130981, + "num_tokens": 390963900.0, + "step": 15120 + }, + { + "epoch": 1.6605534812211729, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.330538749694824, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.731817364692688, + "num_tokens": 390988318.0, + "step": 15121 + }, + { + "epoch": 1.6606632989237866, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.435033082962036, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.7092111706733704, + "num_tokens": 391013088.0, + "step": 15122 + }, + { + "epoch": 1.6607731166264001, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5046911239624023, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.705817699432373, + "num_tokens": 391038533.0, + "step": 15123 + }, + { + "epoch": 1.6608829343290137, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5207698345184326, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7054438591003418, + "num_tokens": 391063810.0, + "step": 15124 + }, + { + "epoch": 1.6609927520316274, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4586877822875977, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6946038007736206, + "num_tokens": 391089542.0, + "step": 15125 + }, + { + "epoch": 1.6611025697342412, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4585177898406982, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7340244650840759, + "num_tokens": 391113833.0, + "step": 15126 + }, + { + "epoch": 1.661212387436855, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4287798404693604, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7200023531913757, + "num_tokens": 391138425.0, + "step": 15127 + }, + { + "epoch": 1.6613222051394685, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5187652111053467, + 
"learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7352567315101624, + "num_tokens": 391161315.0, + "step": 15128 + }, + { + "epoch": 1.661432022842082, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4311561584472656, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7196815609931946, + "num_tokens": 391185436.0, + "step": 15129 + }, + { + "epoch": 1.6615418405446958, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5694422721862793, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7147813439369202, + "num_tokens": 391209066.0, + "step": 15130 + }, + { + "epoch": 1.6616516582473095, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8968796730041504, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7109953761100769, + "num_tokens": 391229470.0, + "step": 15131 + }, + { + "epoch": 1.661761475949923, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2181143760681152, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7103811502456665, + "num_tokens": 391259599.0, + "step": 15132 + }, + { + "epoch": 1.6618712936525368, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4747118949890137, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7190328240394592, + "num_tokens": 391284759.0, + "step": 15133 + }, + { + "epoch": 1.6619811113551504, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.405332326889038, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.714351236820221, + "num_tokens": 391310964.0, + "step": 15134 + }, + { + "epoch": 1.6620909290577641, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3988518714904785, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7190784215927124, + "num_tokens": 391336718.0, + "step": 15135 + }, + { + "epoch": 1.6622007467603779, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3545820713043213, + "learning_rate": 1e-06, + "loss": 
1.0504, + "mean_token_accuracy": 0.6971328258514404, + "num_tokens": 391363522.0, + "step": 15136 + }, + { + "epoch": 1.6623105644629914, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2969536781311035, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7162160277366638, + "num_tokens": 391390172.0, + "step": 15137 + }, + { + "epoch": 1.662420382165605, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.364354133605957, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7054758667945862, + "num_tokens": 391416941.0, + "step": 15138 + }, + { + "epoch": 1.6625301998682187, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2465171813964844, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6962159276008606, + "num_tokens": 391444599.0, + "step": 15139 + }, + { + "epoch": 1.6626400175708325, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2276933193206787, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6968812942504883, + "num_tokens": 391471679.0, + "step": 15140 + }, + { + "epoch": 1.6627498352734462, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.249265670776367, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7056706547737122, + "num_tokens": 391500427.0, + "step": 15141 + }, + { + "epoch": 1.6628596529760598, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.784424066543579, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7430035471916199, + "num_tokens": 391526531.0, + "step": 15142 + }, + { + "epoch": 1.6629694706786733, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.357881784439087, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.737140417098999, + "num_tokens": 391549471.0, + "step": 15143 + }, + { + "epoch": 1.663079288381287, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4457645416259766, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 
0.7181257009506226, + "num_tokens": 391572180.0, + "step": 15144 + }, + { + "epoch": 1.6631891060839008, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.364304304122925, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7128689289093018, + "num_tokens": 391598234.0, + "step": 15145 + }, + { + "epoch": 1.6632989237865143, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.28047251701355, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6939870715141296, + "num_tokens": 391627804.0, + "step": 15146 + }, + { + "epoch": 1.663408741489128, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6502323150634766, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7211438417434692, + "num_tokens": 391650198.0, + "step": 15147 + }, + { + "epoch": 1.6635185591917416, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3003804683685303, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7109975814819336, + "num_tokens": 391679974.0, + "step": 15148 + }, + { + "epoch": 1.6636283768943554, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 7.104444980621338, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.707944929599762, + "num_tokens": 391706421.0, + "step": 15149 + }, + { + "epoch": 1.6637381945969691, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2454676628112793, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7053173780441284, + "num_tokens": 391737789.0, + "step": 15150 + }, + { + "epoch": 1.6638480122995827, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.158097743988037, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7208869457244873, + "num_tokens": 391767656.0, + "step": 15151 + }, + { + "epoch": 1.6639578300021962, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.509521245956421, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7181911468505859, + "num_tokens": 
391788356.0, + "step": 15152 + }, + { + "epoch": 1.66406764770481, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.256803512573242, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7053723335266113, + "num_tokens": 391817753.0, + "step": 15153 + }, + { + "epoch": 1.6641774654074237, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4635167121887207, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7139600515365601, + "num_tokens": 391842091.0, + "step": 15154 + }, + { + "epoch": 1.6642872831100375, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2671544551849365, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6970820426940918, + "num_tokens": 391870270.0, + "step": 15155 + }, + { + "epoch": 1.664397100812651, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0625603199005127, + "learning_rate": 1e-06, + "loss": 1.1467, + "mean_token_accuracy": 0.6723983883857727, + "num_tokens": 391903803.0, + "step": 15156 + }, + { + "epoch": 1.6645069185152646, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4887824058532715, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7194895148277283, + "num_tokens": 391926968.0, + "step": 15157 + }, + { + "epoch": 1.6646167362178783, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.374138832092285, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6963338851928711, + "num_tokens": 391954503.0, + "step": 15158 + }, + { + "epoch": 1.664726553920492, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.6687209606170654, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7090281844139099, + "num_tokens": 391976220.0, + "step": 15159 + }, + { + "epoch": 1.6648363716231056, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.270894765853882, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7073808312416077, + "num_tokens": 392003237.0, + "step": 15160 + }, + { + 
"epoch": 1.6649461893257194, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.125356435775757, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7127593755722046, + "num_tokens": 392032988.0, + "step": 15161 + }, + { + "epoch": 1.665056007028333, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.333582878112793, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7305064797401428, + "num_tokens": 392057474.0, + "step": 15162 + }, + { + "epoch": 1.6651658247309467, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3271985054016113, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.710241436958313, + "num_tokens": 392084909.0, + "step": 15163 + }, + { + "epoch": 1.6652756424335604, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3161461353302, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6976975798606873, + "num_tokens": 392111931.0, + "step": 15164 + }, + { + "epoch": 1.665385460136174, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3091464042663574, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7224975228309631, + "num_tokens": 392138413.0, + "step": 15165 + }, + { + "epoch": 1.6654952778387875, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.360879898071289, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7165380716323853, + "num_tokens": 392164030.0, + "step": 15166 + }, + { + "epoch": 1.6656050955414012, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.330824851989746, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7106156945228577, + "num_tokens": 392189971.0, + "step": 15167 + }, + { + "epoch": 1.665714913244015, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.308759927749634, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7131509780883789, + "num_tokens": 392217168.0, + "step": 15168 + }, + { + "epoch": 1.6658247309466288, + "ewc_loss": 
1.895427703857422e-05, + "grad_norm": 2.2947275638580322, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.708667516708374, + "num_tokens": 392245303.0, + "step": 15169 + }, + { + "epoch": 1.6659345486492423, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3084516525268555, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7243832349777222, + "num_tokens": 392270791.0, + "step": 15170 + }, + { + "epoch": 1.6660443663518558, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4807748794555664, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7068421840667725, + "num_tokens": 392295622.0, + "step": 15171 + }, + { + "epoch": 1.6661541840544696, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3995602130889893, + "learning_rate": 1e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.7024357318878174, + "num_tokens": 392322785.0, + "step": 15172 + }, + { + "epoch": 1.6662640017570833, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8421409130096436, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.716630220413208, + "num_tokens": 392343448.0, + "step": 15173 + }, + { + "epoch": 1.6663738194596969, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.8897461891174316, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7243852615356445, + "num_tokens": 392362200.0, + "step": 15174 + }, + { + "epoch": 1.6664836371623104, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.279996633529663, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7079280018806458, + "num_tokens": 392390515.0, + "step": 15175 + }, + { + "epoch": 1.6665934548649242, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.362907648086548, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7090007066726685, + "num_tokens": 392416679.0, + "step": 15176 + }, + { + "epoch": 1.666703272567538, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.4903361797332764, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7084156274795532, + "num_tokens": 392441347.0, + "step": 15177 + }, + { + "epoch": 1.6668130902701517, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2436273097991943, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.7025894522666931, + "num_tokens": 392472060.0, + "step": 15178 + }, + { + "epoch": 1.6669229079727652, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4945003986358643, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6946007609367371, + "num_tokens": 392494595.0, + "step": 15179 + }, + { + "epoch": 1.6670327256753787, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4394545555114746, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7143422365188599, + "num_tokens": 392521890.0, + "step": 15180 + }, + { + "epoch": 1.6671425433779925, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.542325735092163, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7068935632705688, + "num_tokens": 392543064.0, + "step": 15181 + }, + { + "epoch": 1.6672523610806063, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5754692554473877, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7175078392028809, + "num_tokens": 392565624.0, + "step": 15182 + }, + { + "epoch": 1.66736217878322, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.517228841781616, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7276402711868286, + "num_tokens": 392588081.0, + "step": 15183 + }, + { + "epoch": 1.6674719964858336, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3705029487609863, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7156157493591309, + "num_tokens": 392615830.0, + "step": 15184 + }, + { + "epoch": 1.667581814188447, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2417948246002197, + 
"learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7260847091674805, + "num_tokens": 392643642.0, + "step": 15185 + }, + { + "epoch": 1.6676916318910608, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4006927013397217, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7067261934280396, + "num_tokens": 392668842.0, + "step": 15186 + }, + { + "epoch": 1.6678014495936746, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5705161094665527, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7243423461914062, + "num_tokens": 392691727.0, + "step": 15187 + }, + { + "epoch": 1.6679112672962881, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.30468487739563, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7087740302085876, + "num_tokens": 392718206.0, + "step": 15188 + }, + { + "epoch": 1.6680210849989017, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0430490970611572, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7109291553497314, + "num_tokens": 392749720.0, + "step": 15189 + }, + { + "epoch": 1.6681309027015154, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3681018352508545, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.692023515701294, + "num_tokens": 392775450.0, + "step": 15190 + }, + { + "epoch": 1.6682407204041292, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.260469675064087, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7033571004867554, + "num_tokens": 392805725.0, + "step": 15191 + }, + { + "epoch": 1.668350538106743, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5435469150543213, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.704338788986206, + "num_tokens": 392828935.0, + "step": 15192 + }, + { + "epoch": 1.6684603558093565, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.470369815826416, + "learning_rate": 1e-06, + "loss": 
0.9394, + "mean_token_accuracy": 0.7238431572914124, + "num_tokens": 392852371.0, + "step": 15193 + }, + { + "epoch": 1.66857017351197, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.439687967300415, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7139960527420044, + "num_tokens": 392876515.0, + "step": 15194 + }, + { + "epoch": 1.6686799912145838, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1681556701660156, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.704945981502533, + "num_tokens": 392909674.0, + "step": 15195 + }, + { + "epoch": 1.6687898089171975, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.0843100547790527, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.696081280708313, + "num_tokens": 392942641.0, + "step": 15196 + }, + { + "epoch": 1.668899626619811, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.9992752075195312, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6977413892745972, + "num_tokens": 392976520.0, + "step": 15197 + }, + { + "epoch": 1.6690094443224248, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5984740257263184, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.699725866317749, + "num_tokens": 392999207.0, + "step": 15198 + }, + { + "epoch": 1.6691192620250384, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1043009757995605, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7031600475311279, + "num_tokens": 393029558.0, + "step": 15199 + }, + { + "epoch": 1.669229079727652, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.449434518814087, + "learning_rate": 1e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7431193590164185, + "num_tokens": 393051392.0, + "step": 15200 + }, + { + "epoch": 1.6693388974302659, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.329655647277832, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 
0.7368297576904297, + "num_tokens": 393076234.0, + "step": 15201 + }, + { + "epoch": 1.6694487151328794, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4067962169647217, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7321294546127319, + "num_tokens": 393099892.0, + "step": 15202 + }, + { + "epoch": 1.669558532835493, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.250922679901123, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7034556865692139, + "num_tokens": 393128596.0, + "step": 15203 + }, + { + "epoch": 1.6696683505381067, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1720998287200928, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7030518054962158, + "num_tokens": 393159301.0, + "step": 15204 + }, + { + "epoch": 1.6697781682407205, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5478124618530273, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7168967723846436, + "num_tokens": 393181250.0, + "step": 15205 + }, + { + "epoch": 1.6698879859433342, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.315575361251831, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6932932138442993, + "num_tokens": 393208502.0, + "step": 15206 + }, + { + "epoch": 1.6699978036459477, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 7.070016384124756, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7129786610603333, + "num_tokens": 393234328.0, + "step": 15207 + }, + { + "epoch": 1.6701076213485613, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5764379501342773, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7136392593383789, + "num_tokens": 393257177.0, + "step": 15208 + }, + { + "epoch": 1.670217439051175, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.454690456390381, + "learning_rate": 1e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.6828011274337769, + "num_tokens": 
393283471.0, + "step": 15209 + }, + { + "epoch": 1.6703272567537888, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.600928783416748, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7229477167129517, + "num_tokens": 393305955.0, + "step": 15210 + }, + { + "epoch": 1.6704370744564023, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3421504497528076, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7114285230636597, + "num_tokens": 393333716.0, + "step": 15211 + }, + { + "epoch": 1.670546892159016, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.167537212371826, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.721123218536377, + "num_tokens": 393362875.0, + "step": 15212 + }, + { + "epoch": 1.6706567098616296, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.597167491912842, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7144060730934143, + "num_tokens": 393383648.0, + "step": 15213 + }, + { + "epoch": 1.6707665275642434, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.347405433654785, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7059059739112854, + "num_tokens": 393410602.0, + "step": 15214 + }, + { + "epoch": 1.6708763452668571, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.444742441177368, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7179659605026245, + "num_tokens": 393433729.0, + "step": 15215 + }, + { + "epoch": 1.6709861629694707, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4658849239349365, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7056210041046143, + "num_tokens": 393457823.0, + "step": 15216 + }, + { + "epoch": 1.6710959806720842, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.203371047973633, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6960377097129822, + "num_tokens": 393487977.0, + "step": 15217 + }, + { + 
"epoch": 1.671205798374698, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.289275884628296, + "learning_rate": 1e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6802845001220703, + "num_tokens": 393520805.0, + "step": 15218 + }, + { + "epoch": 1.6713156160773117, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.202223300933838, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.7000995874404907, + "num_tokens": 393550646.0, + "step": 15219 + }, + { + "epoch": 1.6714254337799255, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4699063301086426, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7264197468757629, + "num_tokens": 393572080.0, + "step": 15220 + }, + { + "epoch": 1.671535251482539, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.337874174118042, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6972169280052185, + "num_tokens": 393599908.0, + "step": 15221 + }, + { + "epoch": 1.6716450691851525, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4614245891571045, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7241978645324707, + "num_tokens": 393622182.0, + "step": 15222 + }, + { + "epoch": 1.6717548868877663, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3035826683044434, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7118432521820068, + "num_tokens": 393647899.0, + "step": 15223 + }, + { + "epoch": 1.67186470459038, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.437929630279541, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7255126237869263, + "num_tokens": 393671432.0, + "step": 15224 + }, + { + "epoch": 1.6719745222929936, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.370819330215454, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7242555618286133, + "num_tokens": 393695873.0, + "step": 15225 + }, + { + "epoch": 1.6720843399956071, + 
"ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.475271701812744, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7140969038009644, + "num_tokens": 393721029.0, + "step": 15226 + }, + { + "epoch": 1.6721941576982209, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.152249813079834, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7069822549819946, + "num_tokens": 393751345.0, + "step": 15227 + }, + { + "epoch": 1.6723039754008346, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4178555011749268, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7037384510040283, + "num_tokens": 393777302.0, + "step": 15228 + }, + { + "epoch": 1.6724137931034484, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.1968886852264404, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.692436158657074, + "num_tokens": 393805391.0, + "step": 15229 + }, + { + "epoch": 1.672523610806062, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.76137375831604, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6984243392944336, + "num_tokens": 393825653.0, + "step": 15230 + }, + { + "epoch": 1.6726334285086755, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3437061309814453, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.694898247718811, + "num_tokens": 393852131.0, + "step": 15231 + }, + { + "epoch": 1.6727432462112892, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2193291187286377, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7145984172821045, + "num_tokens": 393879563.0, + "step": 15232 + }, + { + "epoch": 1.672853063913903, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.211927652359009, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.7003180980682373, + "num_tokens": 393908384.0, + "step": 15233 + }, + { + "epoch": 1.6729628816165167, + "ewc_loss": 1.895427703857422e-05, + 
"grad_norm": 2.6536672115325928, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7094812393188477, + "num_tokens": 393929500.0, + "step": 15234 + }, + { + "epoch": 1.6730726993191303, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4326677322387695, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7384757399559021, + "num_tokens": 393952337.0, + "step": 15235 + }, + { + "epoch": 1.6731825170217438, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2889416217803955, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6946243047714233, + "num_tokens": 393978801.0, + "step": 15236 + }, + { + "epoch": 1.6732923347243576, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.42423677444458, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7041072249412537, + "num_tokens": 394006803.0, + "step": 15237 + }, + { + "epoch": 1.6734021524269713, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5511248111724854, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.7012705206871033, + "num_tokens": 394031596.0, + "step": 15238 + }, + { + "epoch": 1.6735119701295849, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.641183614730835, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7281931638717651, + "num_tokens": 394053285.0, + "step": 15239 + }, + { + "epoch": 1.6736217878321984, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.218841075897217, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7262825965881348, + "num_tokens": 394081476.0, + "step": 15240 + }, + { + "epoch": 1.6737316055348122, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2376770973205566, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7131524085998535, + "num_tokens": 394109698.0, + "step": 15241 + }, + { + "epoch": 1.673841423237426, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3735156059265137, + 
"learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7463553547859192, + "num_tokens": 394133480.0, + "step": 15242 + }, + { + "epoch": 1.6739512409400397, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.439967632293701, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7192319631576538, + "num_tokens": 394159412.0, + "step": 15243 + }, + { + "epoch": 1.6740610586426532, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.381585121154785, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7011743783950806, + "num_tokens": 394185616.0, + "step": 15244 + }, + { + "epoch": 1.6741708763452667, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2493817806243896, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7217215895652771, + "num_tokens": 394214586.0, + "step": 15245 + }, + { + "epoch": 1.6742806940478805, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.320513963699341, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.7011469602584839, + "num_tokens": 394241550.0, + "step": 15246 + }, + { + "epoch": 1.6743905117504942, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.447510004043579, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7231981754302979, + "num_tokens": 394266073.0, + "step": 15247 + }, + { + "epoch": 1.674500329453108, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4023525714874268, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.7080514430999756, + "num_tokens": 394292595.0, + "step": 15248 + }, + { + "epoch": 1.6746101471557215, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.19573712348938, + "learning_rate": 1e-06, + "loss": 1.094, + "mean_token_accuracy": 0.686861515045166, + "num_tokens": 394323940.0, + "step": 15249 + }, + { + "epoch": 1.674719964858335, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4446709156036377, + "learning_rate": 1e-06, + "loss": 1.0386, 
+ "mean_token_accuracy": 0.6984137892723083, + "num_tokens": 394349290.0, + "step": 15250 + }, + { + "epoch": 1.6748297825609488, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.7597432136535645, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7490600347518921, + "num_tokens": 394368900.0, + "step": 15251 + }, + { + "epoch": 1.6749396002635626, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.530797004699707, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7182121276855469, + "num_tokens": 394390688.0, + "step": 15252 + }, + { + "epoch": 1.6750494179661761, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2428054809570312, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.695960283279419, + "num_tokens": 394419230.0, + "step": 15253 + }, + { + "epoch": 1.6751592356687897, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.064737319946289, + "learning_rate": 1e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.6858726739883423, + "num_tokens": 394458588.0, + "step": 15254 + }, + { + "epoch": 1.6752690533714034, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3866467475891113, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6953748464584351, + "num_tokens": 394486642.0, + "step": 15255 + }, + { + "epoch": 1.6753788710740172, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.3446099758148193, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7069345712661743, + "num_tokens": 394513361.0, + "step": 15256 + }, + { + "epoch": 1.675488688776631, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4139180183410645, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7105322480201721, + "num_tokens": 394539867.0, + "step": 15257 + }, + { + "epoch": 1.6755985064792445, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.149569511413574, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 
0.7064692974090576, + "num_tokens": 394571484.0, + "step": 15258 + }, + { + "epoch": 1.675708324181858, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.461197853088379, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7425354719161987, + "num_tokens": 394592336.0, + "step": 15259 + }, + { + "epoch": 1.6758181418844718, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4686193466186523, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7052302360534668, + "num_tokens": 394617743.0, + "step": 15260 + }, + { + "epoch": 1.6759279595870855, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4911651611328125, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7188310027122498, + "num_tokens": 394642515.0, + "step": 15261 + }, + { + "epoch": 1.676037777289699, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.525386333465576, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7054020762443542, + "num_tokens": 394665608.0, + "step": 15262 + }, + { + "epoch": 1.6761475949923128, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4594876766204834, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7169634103775024, + "num_tokens": 394690613.0, + "step": 15263 + }, + { + "epoch": 1.6762574126949263, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2184810638427734, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6919145584106445, + "num_tokens": 394720380.0, + "step": 15264 + }, + { + "epoch": 1.67636723039754, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.236664295196533, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7028976082801819, + "num_tokens": 394750186.0, + "step": 15265 + }, + { + "epoch": 1.6764770481001539, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2098355293273926, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.709662914276123, + "num_tokens": 
394780990.0, + "step": 15266 + }, + { + "epoch": 1.6765868658027674, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4824252128601074, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7348772287368774, + "num_tokens": 394802607.0, + "step": 15267 + }, + { + "epoch": 1.676696683505381, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5496938228607178, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7176758050918579, + "num_tokens": 394825914.0, + "step": 15268 + }, + { + "epoch": 1.6768065012079947, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.2093405723571777, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7197393178939819, + "num_tokens": 394855125.0, + "step": 15269 + }, + { + "epoch": 1.6769163189106084, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4828994274139404, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6886712312698364, + "num_tokens": 394883159.0, + "step": 15270 + }, + { + "epoch": 1.6770261366132222, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.4532558917999268, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6940506100654602, + "num_tokens": 394909074.0, + "step": 15271 + }, + { + "epoch": 1.6771359543158357, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.494523763656616, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7092880010604858, + "num_tokens": 394932896.0, + "step": 15272 + }, + { + "epoch": 1.6772457720184493, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.5533578395843506, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6972997188568115, + "num_tokens": 394957311.0, + "step": 15273 + }, + { + "epoch": 1.677355589721063, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 2.348424196243286, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6946201920509338, + "num_tokens": 394981625.0, + "step": 15274 + }, + 
{ + "epoch": 1.6774654074236768, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2029197216033936, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.707653284072876, + "num_tokens": 395011025.0, + "step": 15275 + }, + { + "epoch": 1.6775752251262903, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 3.0593364238739014, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7083975672721863, + "num_tokens": 395031310.0, + "step": 15276 + }, + { + "epoch": 1.677685042828904, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.039090871810913, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6964769959449768, + "num_tokens": 395064109.0, + "step": 15277 + }, + { + "epoch": 1.6777948605315176, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3407225608825684, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7009042501449585, + "num_tokens": 395092223.0, + "step": 15278 + }, + { + "epoch": 1.6779046782341314, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2431392669677734, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7090823650360107, + "num_tokens": 395121767.0, + "step": 15279 + }, + { + "epoch": 1.6780144959367451, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.434924364089966, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.730393648147583, + "num_tokens": 395145604.0, + "step": 15280 + }, + { + "epoch": 1.6781243136393587, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.27268385887146, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7098483443260193, + "num_tokens": 395172840.0, + "step": 15281 + }, + { + "epoch": 1.6782341313419722, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7713756561279297, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.762561023235321, + "num_tokens": 395192119.0, + "step": 15282 + }, + { + "epoch": 1.678343949044586, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.26245379447937, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6976566314697266, + "num_tokens": 395220787.0, + "step": 15283 + }, + { + "epoch": 1.6784537667471997, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.412436008453369, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6872971057891846, + "num_tokens": 395246050.0, + "step": 15284 + }, + { + "epoch": 1.6785635844498135, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.713243007659912, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7172563076019287, + "num_tokens": 395267040.0, + "step": 15285 + }, + { + "epoch": 1.678673402152427, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3041837215423584, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7042633891105652, + "num_tokens": 395293994.0, + "step": 15286 + }, + { + "epoch": 1.6787832198550405, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4201393127441406, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7144565582275391, + "num_tokens": 395319169.0, + "step": 15287 + }, + { + "epoch": 1.6788930375576543, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.638389825820923, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.7048816680908203, + "num_tokens": 395341689.0, + "step": 15288 + }, + { + "epoch": 1.679002855260268, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3067939281463623, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.692629337310791, + "num_tokens": 395369341.0, + "step": 15289 + }, + { + "epoch": 1.6791126729628816, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5176661014556885, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7094978094100952, + "num_tokens": 395391825.0, + "step": 15290 + }, + { + "epoch": 1.6792224906654951, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4069066047668457, + 
"learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7068747282028198, + "num_tokens": 395416109.0, + "step": 15291 + }, + { + "epoch": 1.6793323083681089, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.456486940383911, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6931819915771484, + "num_tokens": 395440664.0, + "step": 15292 + }, + { + "epoch": 1.6794421260707226, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.622663736343384, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7329927086830139, + "num_tokens": 395460977.0, + "step": 15293 + }, + { + "epoch": 1.6795519437733364, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5598371028900146, + "learning_rate": 1e-06, + "loss": 1.0901, + "mean_token_accuracy": 0.685538649559021, + "num_tokens": 395486277.0, + "step": 15294 + }, + { + "epoch": 1.67966176147595, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3800106048583984, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7123998999595642, + "num_tokens": 395511382.0, + "step": 15295 + }, + { + "epoch": 1.6797715791785635, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4065165519714355, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7307540774345398, + "num_tokens": 395535975.0, + "step": 15296 + }, + { + "epoch": 1.6798813968811772, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.347212314605713, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.7007783651351929, + "num_tokens": 395561926.0, + "step": 15297 + }, + { + "epoch": 1.679991214583791, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2763917446136475, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.686857283115387, + "num_tokens": 395589193.0, + "step": 15298 + }, + { + "epoch": 1.6801010322864047, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1497349739074707, + "learning_rate": 1e-06, + "loss": 1.0157, + 
"mean_token_accuracy": 0.7007531523704529, + "num_tokens": 395622093.0, + "step": 15299 + }, + { + "epoch": 1.6802108499890183, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3367044925689697, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7282165288925171, + "num_tokens": 395649011.0, + "step": 15300 + }, + { + "epoch": 1.6803206676916318, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.430727243423462, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7200107574462891, + "num_tokens": 395673215.0, + "step": 15301 + }, + { + "epoch": 1.6804304853942456, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3390755653381348, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6972674131393433, + "num_tokens": 395699330.0, + "step": 15302 + }, + { + "epoch": 1.6805403030968593, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.443218946456909, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7189191579818726, + "num_tokens": 395723199.0, + "step": 15303 + }, + { + "epoch": 1.6806501207994728, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.378833770751953, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7010813355445862, + "num_tokens": 395748689.0, + "step": 15304 + }, + { + "epoch": 1.6807599385020864, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3260462284088135, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7325415015220642, + "num_tokens": 395775038.0, + "step": 15305 + }, + { + "epoch": 1.6808697562047001, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.214694023132324, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7391409873962402, + "num_tokens": 395803491.0, + "step": 15306 + }, + { + "epoch": 1.680979573907314, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.23653507232666, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7252038717269897, + 
"num_tokens": 395829483.0, + "step": 15307 + }, + { + "epoch": 1.6810893916099277, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.649266481399536, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7095745205879211, + "num_tokens": 395849479.0, + "step": 15308 + }, + { + "epoch": 1.6811992093125412, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4417858123779297, + "learning_rate": 1e-06, + "loss": 1.0916, + "mean_token_accuracy": 0.6840740442276001, + "num_tokens": 395877741.0, + "step": 15309 + }, + { + "epoch": 1.6813090270151547, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4905498027801514, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7206965088844299, + "num_tokens": 395900303.0, + "step": 15310 + }, + { + "epoch": 1.6814188447177685, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.246887683868408, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7097976207733154, + "num_tokens": 395928543.0, + "step": 15311 + }, + { + "epoch": 1.6815286624203822, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.269373655319214, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7209525108337402, + "num_tokens": 395954915.0, + "step": 15312 + }, + { + "epoch": 1.681638480122996, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4534990787506104, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7381646633148193, + "num_tokens": 395977319.0, + "step": 15313 + }, + { + "epoch": 1.6817482978256095, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5069732666015625, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.70725417137146, + "num_tokens": 396000471.0, + "step": 15314 + }, + { + "epoch": 1.681858115528223, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.147756338119507, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.7048689723014832, + "num_tokens": 396033624.0, + "step": 15315 + }, + { + 
"epoch": 1.6819679332308368, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3985302448272705, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.710017204284668, + "num_tokens": 396060174.0, + "step": 15316 + }, + { + "epoch": 1.6820777509334506, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.676884651184082, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.701766848564148, + "num_tokens": 396081819.0, + "step": 15317 + }, + { + "epoch": 1.6821875686360641, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2766807079315186, + "learning_rate": 1e-06, + "loss": 1.1293, + "mean_token_accuracy": 0.6700012683868408, + "num_tokens": 396112089.0, + "step": 15318 + }, + { + "epoch": 1.6822973863386776, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6009011268615723, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7188828587532043, + "num_tokens": 396134208.0, + "step": 15319 + }, + { + "epoch": 1.6824072040412914, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3505606651306152, + "learning_rate": 1e-06, + "loss": 1.0975, + "mean_token_accuracy": 0.6775857210159302, + "num_tokens": 396165284.0, + "step": 15320 + }, + { + "epoch": 1.6825170217439052, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.804445743560791, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7109411358833313, + "num_tokens": 396185911.0, + "step": 15321 + }, + { + "epoch": 1.682626839446519, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.538526773452759, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7128545045852661, + "num_tokens": 396208581.0, + "step": 15322 + }, + { + "epoch": 1.6827366571491325, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1915571689605713, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7288717031478882, + "num_tokens": 396235996.0, + "step": 15323 + }, + { + "epoch": 1.682846474851746, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.467822551727295, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6987078189849854, + "num_tokens": 396259865.0, + "step": 15324 + }, + { + "epoch": 1.6829562925543597, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.297508478164673, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.7035809755325317, + "num_tokens": 396288236.0, + "step": 15325 + }, + { + "epoch": 1.6830661102569735, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6821510791778564, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7189851999282837, + "num_tokens": 396310576.0, + "step": 15326 + }, + { + "epoch": 1.683175927959587, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5297396183013916, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7098698616027832, + "num_tokens": 396334790.0, + "step": 15327 + }, + { + "epoch": 1.6832857456622008, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3529419898986816, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7040000557899475, + "num_tokens": 396362040.0, + "step": 15328 + }, + { + "epoch": 1.6833955633648143, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.623873472213745, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6845009326934814, + "num_tokens": 396385734.0, + "step": 15329 + }, + { + "epoch": 1.683505381067428, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.281203269958496, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7162445783615112, + "num_tokens": 396410754.0, + "step": 15330 + }, + { + "epoch": 1.6836151987700418, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.282352924346924, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6890966296195984, + "num_tokens": 396440831.0, + "step": 15331 + }, + { + "epoch": 1.6837250164726554, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.434255838394165, + 
"learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6895782351493835, + "num_tokens": 396466952.0, + "step": 15332 + }, + { + "epoch": 1.683834834175269, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2874033451080322, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7135927081108093, + "num_tokens": 396496122.0, + "step": 15333 + }, + { + "epoch": 1.6839446518778827, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4147627353668213, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7106038331985474, + "num_tokens": 396518979.0, + "step": 15334 + }, + { + "epoch": 1.6840544695804964, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2518553733825684, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7070848345756531, + "num_tokens": 396549076.0, + "step": 15335 + }, + { + "epoch": 1.6841642872831102, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2611522674560547, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7011529207229614, + "num_tokens": 396579410.0, + "step": 15336 + }, + { + "epoch": 1.6842741049857237, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.360419750213623, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7132993936538696, + "num_tokens": 396605417.0, + "step": 15337 + }, + { + "epoch": 1.6843839226883373, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4314606189727783, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7138417959213257, + "num_tokens": 396629173.0, + "step": 15338 + }, + { + "epoch": 1.684493740390951, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3507330417633057, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.7039604783058167, + "num_tokens": 396657129.0, + "step": 15339 + }, + { + "epoch": 1.6846035580935648, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.586771249771118, + "learning_rate": 1e-06, + "loss": 0.9226, + 
"mean_token_accuracy": 0.7277918457984924, + "num_tokens": 396678250.0, + "step": 15340 + }, + { + "epoch": 1.6847133757961783, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3436672687530518, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7267811298370361, + "num_tokens": 396702668.0, + "step": 15341 + }, + { + "epoch": 1.684823193498792, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.469872236251831, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7182480096817017, + "num_tokens": 396727824.0, + "step": 15342 + }, + { + "epoch": 1.6849330112014056, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3026793003082275, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7088356018066406, + "num_tokens": 396756753.0, + "step": 15343 + }, + { + "epoch": 1.6850428289040194, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5072953701019287, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.711773157119751, + "num_tokens": 396780860.0, + "step": 15344 + }, + { + "epoch": 1.685152646606633, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.572892189025879, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6944683790206909, + "num_tokens": 396804809.0, + "step": 15345 + }, + { + "epoch": 1.6852624643092466, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.551032066345215, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7304832935333252, + "num_tokens": 396827140.0, + "step": 15346 + }, + { + "epoch": 1.6853722820118602, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.458219528198242, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.705417275428772, + "num_tokens": 396851950.0, + "step": 15347 + }, + { + "epoch": 1.685482099714474, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3027026653289795, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7238258123397827, + "num_tokens": 
396876087.0, + "step": 15348 + }, + { + "epoch": 1.6855919174170877, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1629037857055664, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.7027570009231567, + "num_tokens": 396905161.0, + "step": 15349 + }, + { + "epoch": 1.6857017351197014, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.587143898010254, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7100641131401062, + "num_tokens": 396928353.0, + "step": 15350 + }, + { + "epoch": 1.685811552822315, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4342284202575684, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.715406060218811, + "num_tokens": 396952381.0, + "step": 15351 + }, + { + "epoch": 1.6859213705249285, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3725132942199707, + "learning_rate": 1e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.680021345615387, + "num_tokens": 396981463.0, + "step": 15352 + }, + { + "epoch": 1.6860311882275423, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3900814056396484, + "learning_rate": 1e-06, + "loss": 1.1072, + "mean_token_accuracy": 0.6820188164710999, + "num_tokens": 397008751.0, + "step": 15353 + }, + { + "epoch": 1.686141005930156, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6471898555755615, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7124528288841248, + "num_tokens": 397030367.0, + "step": 15354 + }, + { + "epoch": 1.6862508236327696, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.072406053543091, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7070252895355225, + "num_tokens": 397063919.0, + "step": 15355 + }, + { + "epoch": 1.686360641335383, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.334228754043579, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7142168879508972, + "num_tokens": 397090232.0, + "step": 15356 + }, + { + "epoch": 
1.6864704590379969, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5245745182037354, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7237489223480225, + "num_tokens": 397111520.0, + "step": 15357 + }, + { + "epoch": 1.6865802767406106, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.075796365737915, + "learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6937876343727112, + "num_tokens": 397144790.0, + "step": 15358 + }, + { + "epoch": 1.6866900944432244, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2955427169799805, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.7099316716194153, + "num_tokens": 397171378.0, + "step": 15359 + }, + { + "epoch": 1.686799912145838, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5228278636932373, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7155336141586304, + "num_tokens": 397192336.0, + "step": 15360 + }, + { + "epoch": 1.6869097298484514, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.379337787628174, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7318439483642578, + "num_tokens": 397217011.0, + "step": 15361 + }, + { + "epoch": 1.6870195475510652, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.66321063041687, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7004613876342773, + "num_tokens": 397236937.0, + "step": 15362 + }, + { + "epoch": 1.687129365253679, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4998199939727783, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7042607069015503, + "num_tokens": 397262375.0, + "step": 15363 + }, + { + "epoch": 1.6872391829562927, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.275540351867676, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7217514514923096, + "num_tokens": 397287748.0, + "step": 15364 + }, + { + "epoch": 1.6873490006589063, + "ewc_loss": 1.9073486328125e-05, 
+ "grad_norm": 2.1534600257873535, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6917538642883301, + "num_tokens": 397318339.0, + "step": 15365 + }, + { + "epoch": 1.6874588183615198, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.684650182723999, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7302680611610413, + "num_tokens": 397339436.0, + "step": 15366 + }, + { + "epoch": 1.6875686360641335, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3540642261505127, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7229576706886292, + "num_tokens": 397364289.0, + "step": 15367 + }, + { + "epoch": 1.6876784537667473, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.347606897354126, + "learning_rate": 1e-06, + "loss": 1.0953, + "mean_token_accuracy": 0.6829429864883423, + "num_tokens": 397390891.0, + "step": 15368 + }, + { + "epoch": 1.6877882714693608, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4161674976348877, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7156202793121338, + "num_tokens": 397415729.0, + "step": 15369 + }, + { + "epoch": 1.6878980891719744, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2192177772521973, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.705917239189148, + "num_tokens": 397444613.0, + "step": 15370 + }, + { + "epoch": 1.6880079068745881, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.370229959487915, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7136858105659485, + "num_tokens": 397470442.0, + "step": 15371 + }, + { + "epoch": 1.6881177245772019, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4268393516540527, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7233587503433228, + "num_tokens": 397493162.0, + "step": 15372 + }, + { + "epoch": 1.6882275422798156, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.9818925857543945, + 
"learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6937693357467651, + "num_tokens": 397527873.0, + "step": 15373 + }, + { + "epoch": 1.6883373599824292, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.416322708129883, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7285860776901245, + "num_tokens": 397551717.0, + "step": 15374 + }, + { + "epoch": 1.6884471776850427, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4102025032043457, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7150055170059204, + "num_tokens": 397576725.0, + "step": 15375 + }, + { + "epoch": 1.6885569953876565, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5295746326446533, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7145982384681702, + "num_tokens": 397597859.0, + "step": 15376 + }, + { + "epoch": 1.6886668130902702, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6326382160186768, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.707390546798706, + "num_tokens": 397620983.0, + "step": 15377 + }, + { + "epoch": 1.6887766307928838, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.094572067260742, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7166858315467834, + "num_tokens": 397654202.0, + "step": 15378 + }, + { + "epoch": 1.6888864484954975, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.172956705093384, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7259213328361511, + "num_tokens": 397684205.0, + "step": 15379 + }, + { + "epoch": 1.688996266198111, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4882490634918213, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.713652491569519, + "num_tokens": 397709024.0, + "step": 15380 + }, + { + "epoch": 1.6891060839007248, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.332974910736084, + "learning_rate": 1e-06, + "loss": 0.9505, + 
"mean_token_accuracy": 0.7253640294075012, + "num_tokens": 397734698.0, + "step": 15381 + }, + { + "epoch": 1.6892159016033386, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.446129560470581, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7161106467247009, + "num_tokens": 397757587.0, + "step": 15382 + }, + { + "epoch": 1.689325719305952, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.730881452560425, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7264971733093262, + "num_tokens": 397776130.0, + "step": 15383 + }, + { + "epoch": 1.6894355370085656, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.352715492248535, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7136081457138062, + "num_tokens": 397801847.0, + "step": 15384 + }, + { + "epoch": 1.6895453547111794, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.187720775604248, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7216305732727051, + "num_tokens": 397830107.0, + "step": 15385 + }, + { + "epoch": 1.6896551724137931, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 32.33726119995117, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6993045806884766, + "num_tokens": 397856490.0, + "step": 15386 + }, + { + "epoch": 1.689764990116407, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2490599155426025, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.70225590467453, + "num_tokens": 397886156.0, + "step": 15387 + }, + { + "epoch": 1.6898748078190204, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.402613401412964, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7277812361717224, + "num_tokens": 397910565.0, + "step": 15388 + }, + { + "epoch": 1.689984625521634, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.657047748565674, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7325084805488586, + 
"num_tokens": 397931254.0, + "step": 15389 + }, + { + "epoch": 1.6900944432242477, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2485127449035645, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7121171355247498, + "num_tokens": 397960044.0, + "step": 15390 + }, + { + "epoch": 1.6902042609268615, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.366595506668091, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6981756687164307, + "num_tokens": 397988078.0, + "step": 15391 + }, + { + "epoch": 1.690314078629475, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.430896520614624, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.694048285484314, + "num_tokens": 398014185.0, + "step": 15392 + }, + { + "epoch": 1.6904238963320888, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.236875295639038, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7099631428718567, + "num_tokens": 398041080.0, + "step": 15393 + }, + { + "epoch": 1.6905337140347023, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4362051486968994, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7166597843170166, + "num_tokens": 398066396.0, + "step": 15394 + }, + { + "epoch": 1.690643531737316, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.44319748878479, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7136563062667847, + "num_tokens": 398091231.0, + "step": 15395 + }, + { + "epoch": 1.6907533494399298, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.565192461013794, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.711962878704071, + "num_tokens": 398113988.0, + "step": 15396 + }, + { + "epoch": 1.6908631671425434, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3701772689819336, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6968722939491272, + "num_tokens": 398139097.0, + "step": 15397 + }, + { + 
"epoch": 1.690972984845157, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.35784912109375, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7223491668701172, + "num_tokens": 398166310.0, + "step": 15398 + }, + { + "epoch": 1.6910828025477707, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5601940155029297, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7240825891494751, + "num_tokens": 398188479.0, + "step": 15399 + }, + { + "epoch": 1.6911926202503844, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.344449996948242, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7142668962478638, + "num_tokens": 398214429.0, + "step": 15400 + }, + { + "epoch": 1.6913024379529982, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 3.667325019836426, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7035008668899536, + "num_tokens": 398246406.0, + "step": 15401 + }, + { + "epoch": 1.6914122556556117, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.30558443069458, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7072814702987671, + "num_tokens": 398273894.0, + "step": 15402 + }, + { + "epoch": 1.6915220733582252, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6270408630371094, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7159479260444641, + "num_tokens": 398295936.0, + "step": 15403 + }, + { + "epoch": 1.691631891060839, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.365125894546509, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7040554285049438, + "num_tokens": 398322470.0, + "step": 15404 + }, + { + "epoch": 1.6917417087634528, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4086532592773438, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6951128244400024, + "num_tokens": 398347501.0, + "step": 15405 + }, + { + "epoch": 1.6918515264660663, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.249810218811035, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.7065224647521973, + "num_tokens": 398377375.0, + "step": 15406 + }, + { + "epoch": 1.6919613441686798, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7642815113067627, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7207669615745544, + "num_tokens": 398397998.0, + "step": 15407 + }, + { + "epoch": 1.6920711618712936, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.986713171005249, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7010255455970764, + "num_tokens": 398433069.0, + "step": 15408 + }, + { + "epoch": 1.6921809795739073, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.633636236190796, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7220094799995422, + "num_tokens": 398455663.0, + "step": 15409 + }, + { + "epoch": 1.692290797276521, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.589097261428833, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7099928855895996, + "num_tokens": 398478737.0, + "step": 15410 + }, + { + "epoch": 1.6924006149791346, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2349660396575928, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7192668914794922, + "num_tokens": 398506106.0, + "step": 15411 + }, + { + "epoch": 1.6925104326817482, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5707216262817383, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7415671944618225, + "num_tokens": 398526706.0, + "step": 15412 + }, + { + "epoch": 1.692620250384362, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2135136127471924, + "learning_rate": 1e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6832425594329834, + "num_tokens": 398558623.0, + "step": 15413 + }, + { + "epoch": 1.6927300680869757, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.613656759262085, 
+ "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.726162314414978, + "num_tokens": 398580334.0, + "step": 15414 + }, + { + "epoch": 1.6928398857895894, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.669036626815796, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7430678009986877, + "num_tokens": 398601379.0, + "step": 15415 + }, + { + "epoch": 1.692949703492203, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.34088397026062, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7312193512916565, + "num_tokens": 398629504.0, + "step": 15416 + }, + { + "epoch": 1.6930595211948165, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4526450634002686, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6858631372451782, + "num_tokens": 398655967.0, + "step": 15417 + }, + { + "epoch": 1.6931693388974303, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.619692802429199, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7047067284584045, + "num_tokens": 398677510.0, + "step": 15418 + }, + { + "epoch": 1.693279156600044, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 8.421514511108398, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7042706608772278, + "num_tokens": 398705020.0, + "step": 15419 + }, + { + "epoch": 1.6933889743026576, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.550356864929199, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7103820443153381, + "num_tokens": 398730620.0, + "step": 15420 + }, + { + "epoch": 1.693498792005271, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.494739294052124, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7338793277740479, + "num_tokens": 398752590.0, + "step": 15421 + }, + { + "epoch": 1.6936086097078848, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.443856716156006, + "learning_rate": 1e-06, + "loss": 0.8721, + 
"mean_token_accuracy": 0.7389769554138184, + "num_tokens": 398775873.0, + "step": 15422 + }, + { + "epoch": 1.6937184274104986, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.481983184814453, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7191908359527588, + "num_tokens": 398798566.0, + "step": 15423 + }, + { + "epoch": 1.6938282451131124, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5406248569488525, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.706963837146759, + "num_tokens": 398822146.0, + "step": 15424 + }, + { + "epoch": 1.693938062815726, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2722327709198, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6880807876586914, + "num_tokens": 398851007.0, + "step": 15425 + }, + { + "epoch": 1.6940478805183394, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4637253284454346, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7182737588882446, + "num_tokens": 398874032.0, + "step": 15426 + }, + { + "epoch": 1.6941576982209532, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 3.656623601913452, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7058813571929932, + "num_tokens": 398907392.0, + "step": 15427 + }, + { + "epoch": 1.694267515923567, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4249629974365234, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6954176425933838, + "num_tokens": 398932488.0, + "step": 15428 + }, + { + "epoch": 1.6943773336261807, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4501376152038574, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7324316501617432, + "num_tokens": 398955318.0, + "step": 15429 + }, + { + "epoch": 1.6944871513287942, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.9728877544403076, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7306389212608337, + 
"num_tokens": 398973510.0, + "step": 15430 + }, + { + "epoch": 1.6945969690314078, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3937454223632812, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7216885089874268, + "num_tokens": 398998123.0, + "step": 15431 + }, + { + "epoch": 1.6947067867340215, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5816357135772705, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.719971776008606, + "num_tokens": 399021861.0, + "step": 15432 + }, + { + "epoch": 1.6948166044366353, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.638728618621826, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7272270917892456, + "num_tokens": 399042557.0, + "step": 15433 + }, + { + "epoch": 1.6949264221392488, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1912717819213867, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.722334623336792, + "num_tokens": 399072586.0, + "step": 15434 + }, + { + "epoch": 1.6950362398418624, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.598137140274048, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6915642619132996, + "num_tokens": 399099061.0, + "step": 15435 + }, + { + "epoch": 1.6951460575444761, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7600648403167725, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7128406763076782, + "num_tokens": 399117783.0, + "step": 15436 + }, + { + "epoch": 1.6952558752470899, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.48665714263916, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7111868858337402, + "num_tokens": 399141358.0, + "step": 15437 + }, + { + "epoch": 1.6953656929497036, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.304452657699585, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6873621940612793, + "num_tokens": 399169370.0, + "step": 15438 + }, + { + 
"epoch": 1.6954755106523172, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2774808406829834, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7170130610466003, + "num_tokens": 399197959.0, + "step": 15439 + }, + { + "epoch": 1.6955853283549307, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.609379768371582, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.7099736332893372, + "num_tokens": 399220990.0, + "step": 15440 + }, + { + "epoch": 1.6956951460575445, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3559820652008057, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.699851393699646, + "num_tokens": 399246665.0, + "step": 15441 + }, + { + "epoch": 1.6958049637601582, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.15826153755188, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6872574687004089, + "num_tokens": 399276337.0, + "step": 15442 + }, + { + "epoch": 1.6959147814627717, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4471118450164795, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7192249298095703, + "num_tokens": 399298346.0, + "step": 15443 + }, + { + "epoch": 1.6960245991653855, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4950671195983887, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.701240062713623, + "num_tokens": 399323963.0, + "step": 15444 + }, + { + "epoch": 1.696134416867999, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7206850051879883, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7155043482780457, + "num_tokens": 399344041.0, + "step": 15445 + }, + { + "epoch": 1.6962442345706128, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6965227127075195, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7073646783828735, + "num_tokens": 399365154.0, + "step": 15446 + }, + { + "epoch": 1.6963540522732266, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.3745224475860596, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7193949818611145, + "num_tokens": 399390134.0, + "step": 15447 + }, + { + "epoch": 1.69646386997584, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3783881664276123, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7152292728424072, + "num_tokens": 399414294.0, + "step": 15448 + }, + { + "epoch": 1.6965736876784536, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.427097797393799, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6839520335197449, + "num_tokens": 399442632.0, + "step": 15449 + }, + { + "epoch": 1.6966835053810674, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3091228008270264, + "learning_rate": 1e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6787887811660767, + "num_tokens": 399471270.0, + "step": 15450 + }, + { + "epoch": 1.6967933230836811, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.464954137802124, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7153613567352295, + "num_tokens": 399495752.0, + "step": 15451 + }, + { + "epoch": 1.696903140786295, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.302867889404297, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7022605538368225, + "num_tokens": 399523581.0, + "step": 15452 + }, + { + "epoch": 1.6970129584889084, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3212597370147705, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.7006288766860962, + "num_tokens": 399550092.0, + "step": 15453 + }, + { + "epoch": 1.697122776191522, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1383767127990723, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7280822992324829, + "num_tokens": 399579278.0, + "step": 15454 + }, + { + "epoch": 1.6972325938941357, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.0478360652923584, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7121154069900513, + "num_tokens": 399610000.0, + "step": 15455 + }, + { + "epoch": 1.6973424115967495, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2670819759368896, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7288920879364014, + "num_tokens": 399635600.0, + "step": 15456 + }, + { + "epoch": 1.697452229299363, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4445037841796875, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7116425037384033, + "num_tokens": 399660664.0, + "step": 15457 + }, + { + "epoch": 1.6975620470019768, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5497517585754395, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6944623589515686, + "num_tokens": 399687216.0, + "step": 15458 + }, + { + "epoch": 1.6976718647045903, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4446651935577393, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6908352375030518, + "num_tokens": 399711844.0, + "step": 15459 + }, + { + "epoch": 1.697781682407204, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1718952655792236, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6849461197853088, + "num_tokens": 399743060.0, + "step": 15460 + }, + { + "epoch": 1.6978915001098178, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.283271312713623, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6976120471954346, + "num_tokens": 399769595.0, + "step": 15461 + }, + { + "epoch": 1.6980013178124314, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4791526794433594, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7084276676177979, + "num_tokens": 399794162.0, + "step": 15462 + }, + { + "epoch": 1.698111135515045, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4144370555877686, + "learning_rate": 1e-06, + 
"loss": 1.0303, + "mean_token_accuracy": 0.6891717910766602, + "num_tokens": 399820012.0, + "step": 15463 + }, + { + "epoch": 1.6982209532176586, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2954354286193848, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7025808691978455, + "num_tokens": 399846566.0, + "step": 15464 + }, + { + "epoch": 1.6983307709202724, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3080646991729736, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7361828684806824, + "num_tokens": 399872193.0, + "step": 15465 + }, + { + "epoch": 1.6984405886228862, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 3.802170515060425, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.7068442106246948, + "num_tokens": 399897730.0, + "step": 15466 + }, + { + "epoch": 1.6985504063254997, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3402960300445557, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6980257630348206, + "num_tokens": 399926421.0, + "step": 15467 + }, + { + "epoch": 1.6986602240281132, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6323399543762207, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7126505374908447, + "num_tokens": 399946819.0, + "step": 15468 + }, + { + "epoch": 1.698770041730727, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4591970443725586, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7188777923583984, + "num_tokens": 399970656.0, + "step": 15469 + }, + { + "epoch": 1.6988798594333407, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2024290561676025, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7110673189163208, + "num_tokens": 399998962.0, + "step": 15470 + }, + { + "epoch": 1.6989896771359543, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.241877794265747, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 
0.713630199432373, + "num_tokens": 400029421.0, + "step": 15471 + }, + { + "epoch": 1.6990994948385678, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.591820001602173, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6977180242538452, + "num_tokens": 400053607.0, + "step": 15472 + }, + { + "epoch": 1.6992093125411816, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.269500732421875, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7224447727203369, + "num_tokens": 400080343.0, + "step": 15473 + }, + { + "epoch": 1.6993191302437953, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.280339479446411, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6978817582130432, + "num_tokens": 400109965.0, + "step": 15474 + }, + { + "epoch": 1.699428947946409, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.557126045227051, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7157291769981384, + "num_tokens": 400132559.0, + "step": 15475 + }, + { + "epoch": 1.6995387656490226, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.235081195831299, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7167239189147949, + "num_tokens": 400159520.0, + "step": 15476 + }, + { + "epoch": 1.6996485833516362, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.533740758895874, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.701941967010498, + "num_tokens": 400182574.0, + "step": 15477 + }, + { + "epoch": 1.69975840105425, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2393627166748047, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.7005889415740967, + "num_tokens": 400212425.0, + "step": 15478 + }, + { + "epoch": 1.6998682187568637, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4302213191986084, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.721550703048706, + "num_tokens": 400234508.0, + "step": 
15479 + }, + { + "epoch": 1.6999780364594774, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.248772382736206, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7183306813240051, + "num_tokens": 400261336.0, + "step": 15480 + }, + { + "epoch": 1.700087854162091, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.562781810760498, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7099378108978271, + "num_tokens": 400284616.0, + "step": 15481 + }, + { + "epoch": 1.7001976718647045, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3800876140594482, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6998453140258789, + "num_tokens": 400311134.0, + "step": 15482 + }, + { + "epoch": 1.7003074895673183, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.500087022781372, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7173863053321838, + "num_tokens": 400333231.0, + "step": 15483 + }, + { + "epoch": 1.700417307269932, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2173383235931396, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6854120492935181, + "num_tokens": 400366946.0, + "step": 15484 + }, + { + "epoch": 1.7005271249725455, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.416076183319092, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7154088616371155, + "num_tokens": 400391020.0, + "step": 15485 + }, + { + "epoch": 1.700636942675159, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.25829815864563, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7039797902107239, + "num_tokens": 400420854.0, + "step": 15486 + }, + { + "epoch": 1.7007467603777728, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4380602836608887, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7189291715621948, + "num_tokens": 400445400.0, + "step": 15487 + }, + { + "epoch": 1.7008565780803866, + 
"ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4949967861175537, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6941616535186768, + "num_tokens": 400472258.0, + "step": 15488 + }, + { + "epoch": 1.7009663957830004, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.440402030944824, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7192238569259644, + "num_tokens": 400495777.0, + "step": 15489 + }, + { + "epoch": 1.7010762134856139, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4436497688293457, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6999508142471313, + "num_tokens": 400523789.0, + "step": 15490 + }, + { + "epoch": 1.7011860311882274, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5643961429595947, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7217519283294678, + "num_tokens": 400548564.0, + "step": 15491 + }, + { + "epoch": 1.7012958488908412, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.399125576019287, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7001765966415405, + "num_tokens": 400572430.0, + "step": 15492 + }, + { + "epoch": 1.701405666593455, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2659926414489746, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7114815711975098, + "num_tokens": 400600268.0, + "step": 15493 + }, + { + "epoch": 1.7015154842960687, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5703868865966797, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7281550168991089, + "num_tokens": 400622347.0, + "step": 15494 + }, + { + "epoch": 1.7016253019986822, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.9352524280548096, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7244876623153687, + "num_tokens": 400640295.0, + "step": 15495 + }, + { + "epoch": 1.7017351197012958, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.8547229766845703, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7092360258102417, + "num_tokens": 400657890.0, + "step": 15496 + }, + { + "epoch": 1.7018449374039095, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.416041612625122, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.7113102674484253, + "num_tokens": 400682174.0, + "step": 15497 + }, + { + "epoch": 1.7019547551065233, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.387756586074829, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7166709899902344, + "num_tokens": 400707258.0, + "step": 15498 + }, + { + "epoch": 1.7020645728091368, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2926628589630127, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.683672308921814, + "num_tokens": 400736267.0, + "step": 15499 + }, + { + "epoch": 1.7021743905117503, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6480484008789062, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.709804892539978, + "num_tokens": 400758113.0, + "step": 15500 + }, + { + "epoch": 1.702284208214364, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7486038208007812, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7269593477249146, + "num_tokens": 400779398.0, + "step": 15501 + }, + { + "epoch": 1.7023940259169779, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4287109375, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7216055989265442, + "num_tokens": 400803902.0, + "step": 15502 + }, + { + "epoch": 1.7025038436195916, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3626112937927246, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.709242582321167, + "num_tokens": 400828366.0, + "step": 15503 + }, + { + "epoch": 1.7026136613222052, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.212846279144287, + "learning_rate": 1e-06, + "loss": 
1.0005, + "mean_token_accuracy": 0.6996097564697266, + "num_tokens": 400857298.0, + "step": 15504 + }, + { + "epoch": 1.7027234790248187, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.815011501312256, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7195174098014832, + "num_tokens": 400878272.0, + "step": 15505 + }, + { + "epoch": 1.7028332967274324, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.393930673599243, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7084776163101196, + "num_tokens": 400904791.0, + "step": 15506 + }, + { + "epoch": 1.7029431144300462, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1632702350616455, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7333321571350098, + "num_tokens": 400931359.0, + "step": 15507 + }, + { + "epoch": 1.7030529321326597, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.9777244329452515, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6989860534667969, + "num_tokens": 400968802.0, + "step": 15508 + }, + { + "epoch": 1.7031627498352735, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.45312237739563, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7037414312362671, + "num_tokens": 400993412.0, + "step": 15509 + }, + { + "epoch": 1.703272567537887, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5082926750183105, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7092933654785156, + "num_tokens": 401016039.0, + "step": 15510 + }, + { + "epoch": 1.7033823852405008, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4683759212493896, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7187713980674744, + "num_tokens": 401037545.0, + "step": 15511 + }, + { + "epoch": 1.7034922029431145, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.332977771759033, + "learning_rate": 1e-06, + "loss": 1.1184, + "mean_token_accuracy": 0.6713560819625854, 
+ "num_tokens": 401064993.0, + "step": 15512 + }, + { + "epoch": 1.703602020645728, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.299006700515747, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7186787724494934, + "num_tokens": 401091084.0, + "step": 15513 + }, + { + "epoch": 1.7037118383483416, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1136956214904785, + "learning_rate": 1e-06, + "loss": 1.0871, + "mean_token_accuracy": 0.6872545480728149, + "num_tokens": 401122728.0, + "step": 15514 + }, + { + "epoch": 1.7038216560509554, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7810943126678467, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.707278847694397, + "num_tokens": 401142675.0, + "step": 15515 + }, + { + "epoch": 1.7039314737535691, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.309030532836914, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6967834830284119, + "num_tokens": 401170929.0, + "step": 15516 + }, + { + "epoch": 1.7040412914561829, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3672969341278076, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6911009550094604, + "num_tokens": 401198278.0, + "step": 15517 + }, + { + "epoch": 1.7041511091587964, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2945456504821777, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7115296125411987, + "num_tokens": 401225270.0, + "step": 15518 + }, + { + "epoch": 1.70426092686141, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3298230171203613, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7292307615280151, + "num_tokens": 401249881.0, + "step": 15519 + }, + { + "epoch": 1.7043707445640237, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.483241558074951, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7147662043571472, + "num_tokens": 401272146.0, + "step": 15520 + }, + { + 
"epoch": 1.7044805622666375, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.56758713722229, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7160286903381348, + "num_tokens": 401293937.0, + "step": 15521 + }, + { + "epoch": 1.704590379969251, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.418550968170166, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7279776334762573, + "num_tokens": 401317738.0, + "step": 15522 + }, + { + "epoch": 1.7047001976718648, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3529136180877686, + "learning_rate": 1e-06, + "loss": 1.0933, + "mean_token_accuracy": 0.6763409972190857, + "num_tokens": 401348387.0, + "step": 15523 + }, + { + "epoch": 1.7048100153744783, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2000958919525146, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7109055519104004, + "num_tokens": 401381340.0, + "step": 15524 + }, + { + "epoch": 1.704919833077092, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.590071201324463, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7145691514015198, + "num_tokens": 401402595.0, + "step": 15525 + }, + { + "epoch": 1.7050296507797058, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3896727561950684, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7210242748260498, + "num_tokens": 401425200.0, + "step": 15526 + }, + { + "epoch": 1.7051394684823193, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4668309688568115, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6979788541793823, + "num_tokens": 401448836.0, + "step": 15527 + }, + { + "epoch": 1.7052492861849329, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.606516122817993, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7168997526168823, + "num_tokens": 401470107.0, + "step": 15528 + }, + { + "epoch": 1.7053591038875466, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.324134349822998, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.69575035572052, + "num_tokens": 401497024.0, + "step": 15529 + }, + { + "epoch": 1.7054689215901604, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.238910675048828, + "learning_rate": 1e-06, + "loss": 1.113, + "mean_token_accuracy": 0.6708979606628418, + "num_tokens": 401529644.0, + "step": 15530 + }, + { + "epoch": 1.7055787392927741, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4646921157836914, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7066912055015564, + "num_tokens": 401553693.0, + "step": 15531 + }, + { + "epoch": 1.7056885569953877, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4033403396606445, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7065958976745605, + "num_tokens": 401577197.0, + "step": 15532 + }, + { + "epoch": 1.7057983746980012, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1889474391937256, + "learning_rate": 1e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6867420673370361, + "num_tokens": 401608064.0, + "step": 15533 + }, + { + "epoch": 1.705908192400615, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6737773418426514, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7328047752380371, + "num_tokens": 401628407.0, + "step": 15534 + }, + { + "epoch": 1.7060180101032287, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3290953636169434, + "learning_rate": 1e-06, + "loss": 1.1075, + "mean_token_accuracy": 0.6806694269180298, + "num_tokens": 401656515.0, + "step": 15535 + }, + { + "epoch": 1.7061278278058423, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4611024856567383, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7129467129707336, + "num_tokens": 401681436.0, + "step": 15536 + }, + { + "epoch": 1.7062376455084558, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4903059005737305, 
+ "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7087631225585938, + "num_tokens": 401706904.0, + "step": 15537 + }, + { + "epoch": 1.7063474632110696, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5405819416046143, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.719954788684845, + "num_tokens": 401730977.0, + "step": 15538 + }, + { + "epoch": 1.7064572809136833, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3034911155700684, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7232480049133301, + "num_tokens": 401757877.0, + "step": 15539 + }, + { + "epoch": 1.706567098616297, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.34479022026062, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7171996831893921, + "num_tokens": 401784118.0, + "step": 15540 + }, + { + "epoch": 1.7066769163189106, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3504528999328613, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7462135553359985, + "num_tokens": 401809143.0, + "step": 15541 + }, + { + "epoch": 1.7067867340215241, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.61647629737854, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7054053544998169, + "num_tokens": 401830948.0, + "step": 15542 + }, + { + "epoch": 1.706896551724138, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5525529384613037, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7205449342727661, + "num_tokens": 401853041.0, + "step": 15543 + }, + { + "epoch": 1.7070063694267517, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2082786560058594, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7004448771476746, + "num_tokens": 401883021.0, + "step": 15544 + }, + { + "epoch": 1.7071161871293654, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.543065071105957, + "learning_rate": 1e-06, + "loss": 1.0114, + 
"mean_token_accuracy": 0.7058967351913452, + "num_tokens": 401905419.0, + "step": 15545 + }, + { + "epoch": 1.707226004831979, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3098304271698, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6938117146492004, + "num_tokens": 401934862.0, + "step": 15546 + }, + { + "epoch": 1.7073358225345925, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1041197776794434, + "learning_rate": 1e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.6836464405059814, + "num_tokens": 401965412.0, + "step": 15547 + }, + { + "epoch": 1.7074456402372062, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.420708179473877, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6950233578681946, + "num_tokens": 401990219.0, + "step": 15548 + }, + { + "epoch": 1.70755545793982, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.645369052886963, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7160934209823608, + "num_tokens": 402010596.0, + "step": 15549 + }, + { + "epoch": 1.7076652756424335, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4091267585754395, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7150850296020508, + "num_tokens": 402035555.0, + "step": 15550 + }, + { + "epoch": 1.707775093345047, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.234551429748535, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6969732642173767, + "num_tokens": 402064140.0, + "step": 15551 + }, + { + "epoch": 1.7078849110476608, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5731968879699707, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7118402719497681, + "num_tokens": 402085461.0, + "step": 15552 + }, + { + "epoch": 1.7079947287502746, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3034160137176514, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7164721488952637, + 
"num_tokens": 402113389.0, + "step": 15553 + }, + { + "epoch": 1.7081045464528883, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5187878608703613, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.709291398525238, + "num_tokens": 402134794.0, + "step": 15554 + }, + { + "epoch": 1.7082143641555019, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.320225715637207, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6946849822998047, + "num_tokens": 402162204.0, + "step": 15555 + }, + { + "epoch": 1.7083241818581154, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.82592511177063, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7229734659194946, + "num_tokens": 402180576.0, + "step": 15556 + }, + { + "epoch": 1.7084339995607292, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.448305368423462, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7324156165122986, + "num_tokens": 402202089.0, + "step": 15557 + }, + { + "epoch": 1.708543817263343, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6165552139282227, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7316210269927979, + "num_tokens": 402221884.0, + "step": 15558 + }, + { + "epoch": 1.7086536349659565, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5597927570343018, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7070901393890381, + "num_tokens": 402243592.0, + "step": 15559 + }, + { + "epoch": 1.7087634526685702, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.41514253616333, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6934128403663635, + "num_tokens": 402268787.0, + "step": 15560 + }, + { + "epoch": 1.7088732703711838, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1734824180603027, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7013256549835205, + "num_tokens": 402299378.0, + "step": 15561 + }, + { + 
"epoch": 1.7089830880737975, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.291771650314331, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7186214923858643, + "num_tokens": 402324054.0, + "step": 15562 + }, + { + "epoch": 1.7090929057764113, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.375548839569092, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7172649502754211, + "num_tokens": 402346913.0, + "step": 15563 + }, + { + "epoch": 1.7092027234790248, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.380953311920166, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.69896000623703, + "num_tokens": 402373163.0, + "step": 15564 + }, + { + "epoch": 1.7093125411816383, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.398066520690918, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7343608736991882, + "num_tokens": 402396775.0, + "step": 15565 + }, + { + "epoch": 1.709422358884252, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2920079231262207, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7104801535606384, + "num_tokens": 402426353.0, + "step": 15566 + }, + { + "epoch": 1.7095321765868658, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0931015014648438, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6874942779541016, + "num_tokens": 402457270.0, + "step": 15567 + }, + { + "epoch": 1.7096419942894796, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2994871139526367, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7070431709289551, + "num_tokens": 402483166.0, + "step": 15568 + }, + { + "epoch": 1.7097518119920931, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2605743408203125, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7222294807434082, + "num_tokens": 402510246.0, + "step": 15569 + }, + { + "epoch": 1.7098616296947067, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.098468780517578, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7190045118331909, + "num_tokens": 402540081.0, + "step": 15570 + }, + { + "epoch": 1.7099714473973204, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1572306156158447, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6932772994041443, + "num_tokens": 402570877.0, + "step": 15571 + }, + { + "epoch": 1.7100812650999342, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5041606426239014, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7215771675109863, + "num_tokens": 402594434.0, + "step": 15572 + }, + { + "epoch": 1.7101910828025477, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.392247438430786, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.718177080154419, + "num_tokens": 402618572.0, + "step": 15573 + }, + { + "epoch": 1.7103009005051615, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6877834796905518, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7045772075653076, + "num_tokens": 402639406.0, + "step": 15574 + }, + { + "epoch": 1.710410718207775, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.8252134323120117, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7001495361328125, + "num_tokens": 402658305.0, + "step": 15575 + }, + { + "epoch": 1.7105205359103888, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.174152135848999, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.689216136932373, + "num_tokens": 402690186.0, + "step": 15576 + }, + { + "epoch": 1.7106303536130025, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.262006998062134, + "learning_rate": 1e-06, + "loss": 1.1137, + "mean_token_accuracy": 0.6790317296981812, + "num_tokens": 402722651.0, + "step": 15577 + }, + { + "epoch": 1.710740171315616, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5132884979248047, 
+ "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7298054695129395, + "num_tokens": 402746767.0, + "step": 15578 + }, + { + "epoch": 1.7108499890182296, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1167073249816895, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7183898687362671, + "num_tokens": 402777349.0, + "step": 15579 + }, + { + "epoch": 1.7109598067208434, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.219633102416992, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7055838108062744, + "num_tokens": 402806854.0, + "step": 15580 + }, + { + "epoch": 1.7110696244234571, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2308263778686523, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6959316730499268, + "num_tokens": 402836077.0, + "step": 15581 + }, + { + "epoch": 1.7111794421260709, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4288151264190674, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6973682641983032, + "num_tokens": 402862507.0, + "step": 15582 + }, + { + "epoch": 1.7112892598286844, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0749640464782715, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7106320858001709, + "num_tokens": 402895496.0, + "step": 15583 + }, + { + "epoch": 1.711399077531298, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.370229959487915, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.707104504108429, + "num_tokens": 402922000.0, + "step": 15584 + }, + { + "epoch": 1.7115088952339117, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.382599353790283, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7038729190826416, + "num_tokens": 402946708.0, + "step": 15585 + }, + { + "epoch": 1.7116187129365255, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.288142681121826, + "learning_rate": 1e-06, + "loss": 1.0002, + 
"mean_token_accuracy": 0.7037909030914307, + "num_tokens": 402974797.0, + "step": 15586 + }, + { + "epoch": 1.711728530639139, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.412296772003174, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.715526819229126, + "num_tokens": 402999664.0, + "step": 15587 + }, + { + "epoch": 1.7118383483417525, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.194550037384033, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6893991231918335, + "num_tokens": 403031461.0, + "step": 15588 + }, + { + "epoch": 1.7119481660443663, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3375184535980225, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7155492901802063, + "num_tokens": 403058704.0, + "step": 15589 + }, + { + "epoch": 1.71205798374698, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1751811504364014, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7261546850204468, + "num_tokens": 403089446.0, + "step": 15590 + }, + { + "epoch": 1.7121678014495938, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4284088611602783, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6862704753875732, + "num_tokens": 403116757.0, + "step": 15591 + }, + { + "epoch": 1.7122776191522073, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.438854932785034, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7167787551879883, + "num_tokens": 403138993.0, + "step": 15592 + }, + { + "epoch": 1.7123874368548209, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4324190616607666, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.707732617855072, + "num_tokens": 403162916.0, + "step": 15593 + }, + { + "epoch": 1.7124972545574346, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1819374561309814, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7138012051582336, + 
"num_tokens": 403191582.0, + "step": 15594 + }, + { + "epoch": 1.7126070722600484, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.269028425216675, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6973699331283569, + "num_tokens": 403218747.0, + "step": 15595 + }, + { + "epoch": 1.7127168899626621, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1957883834838867, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6941403150558472, + "num_tokens": 403247716.0, + "step": 15596 + }, + { + "epoch": 1.7128267076652757, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.153108835220337, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7183865904808044, + "num_tokens": 403277059.0, + "step": 15597 + }, + { + "epoch": 1.7129365253678892, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.332231283187866, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6967688798904419, + "num_tokens": 403304477.0, + "step": 15598 + }, + { + "epoch": 1.713046343070503, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5624752044677734, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7252984642982483, + "num_tokens": 403327045.0, + "step": 15599 + }, + { + "epoch": 1.7131561607731167, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.726144313812256, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7249161005020142, + "num_tokens": 403346736.0, + "step": 15600 + }, + { + "epoch": 1.7132659784757303, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.428281545639038, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7129614353179932, + "num_tokens": 403372578.0, + "step": 15601 + }, + { + "epoch": 1.7133757961783438, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6414287090301514, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7051540613174438, + "num_tokens": 403393365.0, + "step": 15602 + }, + { + 
"epoch": 1.7134856138809575, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3360822200775146, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7042655944824219, + "num_tokens": 403419727.0, + "step": 15603 + }, + { + "epoch": 1.7135954315835713, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2419211864471436, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7117334604263306, + "num_tokens": 403447141.0, + "step": 15604 + }, + { + "epoch": 1.713705249286185, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.46208119392395, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7049604058265686, + "num_tokens": 403471657.0, + "step": 15605 + }, + { + "epoch": 1.7138150669887986, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3664681911468506, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7129802703857422, + "num_tokens": 403496854.0, + "step": 15606 + }, + { + "epoch": 1.7139248846914121, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.287198066711426, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7071354389190674, + "num_tokens": 403524503.0, + "step": 15607 + }, + { + "epoch": 1.7140347023940259, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2719433307647705, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.69248366355896, + "num_tokens": 403554962.0, + "step": 15608 + }, + { + "epoch": 1.7141445200966396, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0999011993408203, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7034196853637695, + "num_tokens": 403587714.0, + "step": 15609 + }, + { + "epoch": 1.7142543377992534, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3430299758911133, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6915961503982544, + "num_tokens": 403616592.0, + "step": 15610 + }, + { + "epoch": 1.714364155501867, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.3566811084747314, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7143862247467041, + "num_tokens": 403641132.0, + "step": 15611 + }, + { + "epoch": 1.7144739732044805, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.043508768081665, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7088789939880371, + "num_tokens": 403673395.0, + "step": 15612 + }, + { + "epoch": 1.7145837909070942, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.565455198287964, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7401553392410278, + "num_tokens": 403695074.0, + "step": 15613 + }, + { + "epoch": 1.714693608609708, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4989707469940186, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.702196478843689, + "num_tokens": 403722761.0, + "step": 15614 + }, + { + "epoch": 1.7148034263123215, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6505393981933594, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7022486925125122, + "num_tokens": 403743307.0, + "step": 15615 + }, + { + "epoch": 1.714913244014935, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4556543827056885, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7065534591674805, + "num_tokens": 403767386.0, + "step": 15616 + }, + { + "epoch": 1.7150230617175488, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3013854026794434, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7188246250152588, + "num_tokens": 403795522.0, + "step": 15617 + }, + { + "epoch": 1.7151328794201626, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.588298797607422, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7070956230163574, + "num_tokens": 403821048.0, + "step": 15618 + }, + { + "epoch": 1.7152426971227763, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.13535475730896, 
+ "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7143571376800537, + "num_tokens": 403854081.0, + "step": 15619 + }, + { + "epoch": 1.7153525148253899, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.04585862159729, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7177093625068665, + "num_tokens": 403887105.0, + "step": 15620 + }, + { + "epoch": 1.7154623325280034, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4639711380004883, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7018566131591797, + "num_tokens": 403911293.0, + "step": 15621 + }, + { + "epoch": 1.7155721502306172, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5195844173431396, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7110663056373596, + "num_tokens": 403932423.0, + "step": 15622 + }, + { + "epoch": 1.715681967933231, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1512653827667236, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.7001280784606934, + "num_tokens": 403964248.0, + "step": 15623 + }, + { + "epoch": 1.7157917856358444, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.276179313659668, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7284085750579834, + "num_tokens": 403990120.0, + "step": 15624 + }, + { + "epoch": 1.7159016033384582, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6834845542907715, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7273693084716797, + "num_tokens": 404009131.0, + "step": 15625 + }, + { + "epoch": 1.7160114210410717, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2076218128204346, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.702606201171875, + "num_tokens": 404039592.0, + "step": 15626 + }, + { + "epoch": 1.7161212387436855, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5272750854492188, + "learning_rate": 1e-06, + "loss": 0.9573, + 
"mean_token_accuracy": 0.7179739475250244, + "num_tokens": 404061611.0, + "step": 15627 + }, + { + "epoch": 1.7162310564462993, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3173043727874756, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6961618661880493, + "num_tokens": 404089891.0, + "step": 15628 + }, + { + "epoch": 1.7163408741489128, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.244729995727539, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.698469877243042, + "num_tokens": 404118032.0, + "step": 15629 + }, + { + "epoch": 1.7164506918515263, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.8489532470703125, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7099929451942444, + "num_tokens": 404139583.0, + "step": 15630 + }, + { + "epoch": 1.71656050955414, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5341122150421143, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7049143314361572, + "num_tokens": 404162484.0, + "step": 15631 + }, + { + "epoch": 1.7166703272567538, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5406494140625, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7273467779159546, + "num_tokens": 404185439.0, + "step": 15632 + }, + { + "epoch": 1.7167801449593676, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.438568353652954, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7170896530151367, + "num_tokens": 404209797.0, + "step": 15633 + }, + { + "epoch": 1.7168899626619811, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5332205295562744, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7337230443954468, + "num_tokens": 404230140.0, + "step": 15634 + }, + { + "epoch": 1.7169997803645947, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5117340087890625, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.700995147228241, + 
"num_tokens": 404254200.0, + "step": 15635 + }, + { + "epoch": 1.7171095980672084, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4428493976593018, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.713626503944397, + "num_tokens": 404278519.0, + "step": 15636 + }, + { + "epoch": 1.7172194157698222, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4255545139312744, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6967513561248779, + "num_tokens": 404303770.0, + "step": 15637 + }, + { + "epoch": 1.7173292334724357, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.618536949157715, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7208994626998901, + "num_tokens": 404325138.0, + "step": 15638 + }, + { + "epoch": 1.7174390511750495, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.524191379547119, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6935166716575623, + "num_tokens": 404349688.0, + "step": 15639 + }, + { + "epoch": 1.717548868877663, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.095996618270874, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7032614350318909, + "num_tokens": 404382347.0, + "step": 15640 + }, + { + "epoch": 1.7176586865802768, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1826910972595215, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.707776665687561, + "num_tokens": 404409840.0, + "step": 15641 + }, + { + "epoch": 1.7177685042828905, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.190392255783081, + "learning_rate": 1e-06, + "loss": 1.1075, + "mean_token_accuracy": 0.6807090044021606, + "num_tokens": 404441008.0, + "step": 15642 + }, + { + "epoch": 1.717878321985504, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1748580932617188, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6985605359077454, + "num_tokens": 404468403.0, + "step": 15643 + }, + { + 
"epoch": 1.7179881396881176, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.243659496307373, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7111978530883789, + "num_tokens": 404497085.0, + "step": 15644 + }, + { + "epoch": 1.7180979573907313, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2821505069732666, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6946753263473511, + "num_tokens": 404529952.0, + "step": 15645 + }, + { + "epoch": 1.718207775093345, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1879518032073975, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.692981481552124, + "num_tokens": 404559855.0, + "step": 15646 + }, + { + "epoch": 1.7183175927959589, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3312041759490967, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7047522068023682, + "num_tokens": 404585447.0, + "step": 15647 + }, + { + "epoch": 1.7184274104985724, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2555012702941895, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.7038816213607788, + "num_tokens": 404615258.0, + "step": 15648 + }, + { + "epoch": 1.718537228201186, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4444851875305176, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.701226532459259, + "num_tokens": 404640487.0, + "step": 15649 + }, + { + "epoch": 1.7186470459037997, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.31443190574646, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.692389726638794, + "num_tokens": 404669984.0, + "step": 15650 + }, + { + "epoch": 1.7187568636064134, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3360445499420166, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.7022721767425537, + "num_tokens": 404695795.0, + "step": 15651 + }, + { + "epoch": 1.718866681309027, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.6357340812683105, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.723818302154541, + "num_tokens": 404716863.0, + "step": 15652 + }, + { + "epoch": 1.7189764990116405, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.8784143924713135, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7217289209365845, + "num_tokens": 404733635.0, + "step": 15653 + }, + { + "epoch": 1.7190863167142543, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7528772354125977, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7255448698997498, + "num_tokens": 404754534.0, + "step": 15654 + }, + { + "epoch": 1.719196134416868, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.214827299118042, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6911268830299377, + "num_tokens": 404786352.0, + "step": 15655 + }, + { + "epoch": 1.7193059521194818, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.409815549850464, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7060982584953308, + "num_tokens": 404812670.0, + "step": 15656 + }, + { + "epoch": 1.7194157698220953, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5004780292510986, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7188094854354858, + "num_tokens": 404833971.0, + "step": 15657 + }, + { + "epoch": 1.7195255875247089, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0787508487701416, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.705602765083313, + "num_tokens": 404865009.0, + "step": 15658 + }, + { + "epoch": 1.7196354052273226, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3914127349853516, + "learning_rate": 1e-06, + "loss": 1.0985, + "mean_token_accuracy": 0.6895104050636292, + "num_tokens": 404891573.0, + "step": 15659 + }, + { + "epoch": 1.7197452229299364, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.3938310146331787, + "learning_rate": 1e-06, + "loss": 1.108, + "mean_token_accuracy": 0.7020896673202515, + "num_tokens": 404918890.0, + "step": 15660 + }, + { + "epoch": 1.7198550406325501, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.271982192993164, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7370839715003967, + "num_tokens": 404943132.0, + "step": 15661 + }, + { + "epoch": 1.7199648583351637, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3514246940612793, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.7006570100784302, + "num_tokens": 404970183.0, + "step": 15662 + }, + { + "epoch": 1.7200746760377772, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.340223550796509, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.7037215232849121, + "num_tokens": 404996090.0, + "step": 15663 + }, + { + "epoch": 1.720184493740391, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1583571434020996, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7117685079574585, + "num_tokens": 405024775.0, + "step": 15664 + }, + { + "epoch": 1.7202943114430047, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.165750503540039, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.695366621017456, + "num_tokens": 405056691.0, + "step": 15665 + }, + { + "epoch": 1.7204041291456182, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4453558921813965, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6986034512519836, + "num_tokens": 405082271.0, + "step": 15666 + }, + { + "epoch": 1.7205139468482318, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6469123363494873, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7066850066184998, + "num_tokens": 405104542.0, + "step": 15667 + }, + { + "epoch": 1.7206237645508455, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.183270215988159, + "learning_rate": 1e-06, + 
"loss": 0.9527, + "mean_token_accuracy": 0.7159625291824341, + "num_tokens": 405135485.0, + "step": 15668 + }, + { + "epoch": 1.7207335822534593, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4774231910705566, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7169005870819092, + "num_tokens": 405159707.0, + "step": 15669 + }, + { + "epoch": 1.720843399956073, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4904963970184326, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7092756628990173, + "num_tokens": 405185350.0, + "step": 15670 + }, + { + "epoch": 1.7209532176586866, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2547359466552734, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7228823304176331, + "num_tokens": 405214522.0, + "step": 15671 + }, + { + "epoch": 1.7210630353613001, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.793851375579834, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7145886421203613, + "num_tokens": 405236811.0, + "step": 15672 + }, + { + "epoch": 1.7211728530639139, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1319589614868164, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7367834448814392, + "num_tokens": 405265540.0, + "step": 15673 + }, + { + "epoch": 1.7212826707665276, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3059613704681396, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6987375617027283, + "num_tokens": 405295474.0, + "step": 15674 + }, + { + "epoch": 1.7213924884691414, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4934134483337402, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7130359411239624, + "num_tokens": 405321196.0, + "step": 15675 + }, + { + "epoch": 1.721502306171755, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.177501678466797, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 
0.692997932434082, + "num_tokens": 405351944.0, + "step": 15676 + }, + { + "epoch": 1.7216121238743685, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4682493209838867, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6997725963592529, + "num_tokens": 405378336.0, + "step": 15677 + }, + { + "epoch": 1.7217219415769822, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3011550903320312, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7276552319526672, + "num_tokens": 405404115.0, + "step": 15678 + }, + { + "epoch": 1.721831759279596, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.463745594024658, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7014657258987427, + "num_tokens": 405427676.0, + "step": 15679 + }, + { + "epoch": 1.7219415769822095, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2008376121520996, + "learning_rate": 1e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6894741058349609, + "num_tokens": 405456316.0, + "step": 15680 + }, + { + "epoch": 1.722051394684823, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.644711494445801, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7036134600639343, + "num_tokens": 405478827.0, + "step": 15681 + }, + { + "epoch": 1.7221612123874368, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.373476028442383, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7313755750656128, + "num_tokens": 405503775.0, + "step": 15682 + }, + { + "epoch": 1.7222710300900506, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4039511680603027, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7006641030311584, + "num_tokens": 405530284.0, + "step": 15683 + }, + { + "epoch": 1.7223808477926643, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3599696159362793, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6987961530685425, + "num_tokens": 405560487.0, + 
"step": 15684 + }, + { + "epoch": 1.7224906654952779, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3947086334228516, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7199506759643555, + "num_tokens": 405586029.0, + "step": 15685 + }, + { + "epoch": 1.7226004831978914, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.188462972640991, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7093545794487, + "num_tokens": 405614303.0, + "step": 15686 + }, + { + "epoch": 1.7227103009005051, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4583747386932373, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7113680839538574, + "num_tokens": 405637574.0, + "step": 15687 + }, + { + "epoch": 1.722820118603119, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2902026176452637, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.685979962348938, + "num_tokens": 405664647.0, + "step": 15688 + }, + { + "epoch": 1.7229299363057324, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1409432888031006, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7210333347320557, + "num_tokens": 405695245.0, + "step": 15689 + }, + { + "epoch": 1.7230397540083462, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2322072982788086, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7296947836875916, + "num_tokens": 405721043.0, + "step": 15690 + }, + { + "epoch": 1.7231495717109597, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3324191570281982, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6961567401885986, + "num_tokens": 405748441.0, + "step": 15691 + }, + { + "epoch": 1.7232593894135735, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.582638740539551, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7057524919509888, + "num_tokens": 405770211.0, + "step": 15692 + }, + { + "epoch": 1.7233692071161872, + 
"ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.280775785446167, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7258203625679016, + "num_tokens": 405799085.0, + "step": 15693 + }, + { + "epoch": 1.7234790248188008, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3744893074035645, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7233330607414246, + "num_tokens": 405825810.0, + "step": 15694 + }, + { + "epoch": 1.7235888425214143, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.345153331756592, + "learning_rate": 1e-06, + "loss": 1.0879, + "mean_token_accuracy": 0.6915674209594727, + "num_tokens": 405853812.0, + "step": 15695 + }, + { + "epoch": 1.723698660224028, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.600294828414917, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7212283611297607, + "num_tokens": 405879039.0, + "step": 15696 + }, + { + "epoch": 1.7238084779266418, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.245424270629883, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7045429944992065, + "num_tokens": 405908301.0, + "step": 15697 + }, + { + "epoch": 1.7239182956292556, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.739980459213257, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.727379322052002, + "num_tokens": 405928513.0, + "step": 15698 + }, + { + "epoch": 1.7240281133318691, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.27539324760437, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6982338428497314, + "num_tokens": 405958633.0, + "step": 15699 + }, + { + "epoch": 1.7241379310344827, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.387953996658325, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.721696138381958, + "num_tokens": 405983518.0, + "step": 15700 + }, + { + "epoch": 1.7242477487370964, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.3232388496398926, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7196397185325623, + "num_tokens": 406009479.0, + "step": 15701 + }, + { + "epoch": 1.7243575664397102, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.687983751296997, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7123103141784668, + "num_tokens": 406030749.0, + "step": 15702 + }, + { + "epoch": 1.7244673841423237, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4602606296539307, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.7077552080154419, + "num_tokens": 406056447.0, + "step": 15703 + }, + { + "epoch": 1.7245772018449375, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.8946380615234375, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7451819181442261, + "num_tokens": 406074410.0, + "step": 15704 + }, + { + "epoch": 1.724687019547551, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6161513328552246, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7320692539215088, + "num_tokens": 406096397.0, + "step": 15705 + }, + { + "epoch": 1.7247968372501647, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.37742018699646, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6924556493759155, + "num_tokens": 406123869.0, + "step": 15706 + }, + { + "epoch": 1.7249066549527785, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6989283561706543, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7384587526321411, + "num_tokens": 406147337.0, + "step": 15707 + }, + { + "epoch": 1.725016472655392, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4183452129364014, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7038030624389648, + "num_tokens": 406171769.0, + "step": 15708 + }, + { + "epoch": 1.7251262903580056, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.356865644454956, + "learning_rate": 1e-06, + 
"loss": 0.9986, + "mean_token_accuracy": 0.7013904452323914, + "num_tokens": 406196379.0, + "step": 15709 + }, + { + "epoch": 1.7252361080606193, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2769291400909424, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7080205678939819, + "num_tokens": 406226674.0, + "step": 15710 + }, + { + "epoch": 1.725345925763233, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.266563892364502, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7212709188461304, + "num_tokens": 406251785.0, + "step": 15711 + }, + { + "epoch": 1.7254557434658468, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5630457401275635, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7300198078155518, + "num_tokens": 406272446.0, + "step": 15712 + }, + { + "epoch": 1.7255655611684604, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4006314277648926, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7172293663024902, + "num_tokens": 406297309.0, + "step": 15713 + }, + { + "epoch": 1.725675378871074, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.443315267562866, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7076007127761841, + "num_tokens": 406322257.0, + "step": 15714 + }, + { + "epoch": 1.7257851965736877, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5558178424835205, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7073070406913757, + "num_tokens": 406346570.0, + "step": 15715 + }, + { + "epoch": 1.7258950142763014, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.287122964859009, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7134435176849365, + "num_tokens": 406372128.0, + "step": 15716 + }, + { + "epoch": 1.726004831978915, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.652567148208618, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 
0.7320914268493652, + "num_tokens": 406393200.0, + "step": 15717 + }, + { + "epoch": 1.7261146496815285, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.346740484237671, + "learning_rate": 1e-06, + "loss": 1.0901, + "mean_token_accuracy": 0.6830304861068726, + "num_tokens": 406421258.0, + "step": 15718 + }, + { + "epoch": 1.7262244673841423, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.82232403755188, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7277981042861938, + "num_tokens": 406454628.0, + "step": 15719 + }, + { + "epoch": 1.726334285086756, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2324132919311523, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7279198169708252, + "num_tokens": 406484194.0, + "step": 15720 + }, + { + "epoch": 1.7264441027893698, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.473396062850952, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7055965662002563, + "num_tokens": 406508641.0, + "step": 15721 + }, + { + "epoch": 1.7265539204919833, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2625911235809326, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6898682713508606, + "num_tokens": 406535886.0, + "step": 15722 + }, + { + "epoch": 1.7266637381945968, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6114840507507324, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7078542113304138, + "num_tokens": 406557711.0, + "step": 15723 + }, + { + "epoch": 1.7267735558972106, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.458777666091919, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7175531387329102, + "num_tokens": 406579582.0, + "step": 15724 + }, + { + "epoch": 1.7268833735998244, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.505986452102661, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7279326915740967, + "num_tokens": 406603065.0, + 
"step": 15725 + }, + { + "epoch": 1.7269931913024381, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.346240758895874, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7048986554145813, + "num_tokens": 406627249.0, + "step": 15726 + }, + { + "epoch": 1.7271030090050516, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3490123748779297, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7335353493690491, + "num_tokens": 406651523.0, + "step": 15727 + }, + { + "epoch": 1.7272128267076652, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.359931230545044, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.688460111618042, + "num_tokens": 406677929.0, + "step": 15728 + }, + { + "epoch": 1.727322644410279, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5103752613067627, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7062195539474487, + "num_tokens": 406703411.0, + "step": 15729 + }, + { + "epoch": 1.7274324621128927, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3830838203430176, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.7058005332946777, + "num_tokens": 406727723.0, + "step": 15730 + }, + { + "epoch": 1.7275422798155062, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2932519912719727, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6942055225372314, + "num_tokens": 406755501.0, + "step": 15731 + }, + { + "epoch": 1.7276520975181198, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.747013807296753, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7226043939590454, + "num_tokens": 406775289.0, + "step": 15732 + }, + { + "epoch": 1.7277619152207335, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.322394371032715, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7213969230651855, + "num_tokens": 406804107.0, + "step": 15733 + }, + { + "epoch": 1.7278717329233473, 
+ "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.261209011077881, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7077569961547852, + "num_tokens": 406832425.0, + "step": 15734 + }, + { + "epoch": 1.727981550625961, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3933377265930176, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7511353492736816, + "num_tokens": 406855526.0, + "step": 15735 + }, + { + "epoch": 1.7280913683285746, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5527029037475586, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7128773927688599, + "num_tokens": 406876907.0, + "step": 15736 + }, + { + "epoch": 1.728201186031188, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2726378440856934, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6906747817993164, + "num_tokens": 406904760.0, + "step": 15737 + }, + { + "epoch": 1.7283110037338019, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5087687969207764, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7154920101165771, + "num_tokens": 406928549.0, + "step": 15738 + }, + { + "epoch": 1.7284208214364156, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7089545726776123, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7063682079315186, + "num_tokens": 406949708.0, + "step": 15739 + }, + { + "epoch": 1.7285306391390292, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.23020076751709, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.7031931281089783, + "num_tokens": 406977852.0, + "step": 15740 + }, + { + "epoch": 1.728640456841643, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3729019165039062, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.7013506293296814, + "num_tokens": 407004183.0, + "step": 15741 + }, + { + "epoch": 1.7287502745442564, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.319418430328369, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7171567678451538, + "num_tokens": 407029641.0, + "step": 15742 + }, + { + "epoch": 1.7288600922468702, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1271979808807373, + "learning_rate": 1e-06, + "loss": 1.0925, + "mean_token_accuracy": 0.6849517822265625, + "num_tokens": 407061311.0, + "step": 15743 + }, + { + "epoch": 1.728969909949484, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.531435966491699, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7255275249481201, + "num_tokens": 407083842.0, + "step": 15744 + }, + { + "epoch": 1.7290797276520975, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4689695835113525, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7041292190551758, + "num_tokens": 407106403.0, + "step": 15745 + }, + { + "epoch": 1.729189545354711, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.640143871307373, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7293004393577576, + "num_tokens": 407127611.0, + "step": 15746 + }, + { + "epoch": 1.7292993630573248, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.334083318710327, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6975908875465393, + "num_tokens": 407155998.0, + "step": 15747 + }, + { + "epoch": 1.7294091807599385, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.340306520462036, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7472400665283203, + "num_tokens": 407180192.0, + "step": 15748 + }, + { + "epoch": 1.7295189984625523, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.386925220489502, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6973707675933838, + "num_tokens": 407207098.0, + "step": 15749 + }, + { + "epoch": 1.7296288161651658, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.481656789779663, + "learning_rate": 1e-06, + "loss": 
0.8905, + "mean_token_accuracy": 0.7340826988220215, + "num_tokens": 407229106.0, + "step": 15750 + }, + { + "epoch": 1.7297386338677794, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5370869636535645, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7263725996017456, + "num_tokens": 407251733.0, + "step": 15751 + }, + { + "epoch": 1.7298484515703931, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4823665618896484, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6996088027954102, + "num_tokens": 407276925.0, + "step": 15752 + }, + { + "epoch": 1.7299582692730069, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.385632038116455, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7064743041992188, + "num_tokens": 407301318.0, + "step": 15753 + }, + { + "epoch": 1.7300680869756204, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.647212028503418, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7219959497451782, + "num_tokens": 407323652.0, + "step": 15754 + }, + { + "epoch": 1.7301779046782342, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1258773803710938, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7100961804389954, + "num_tokens": 407353689.0, + "step": 15755 + }, + { + "epoch": 1.7302877223808477, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.454862356185913, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6958463191986084, + "num_tokens": 407378942.0, + "step": 15756 + }, + { + "epoch": 1.7303975400834615, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.425516366958618, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7135018706321716, + "num_tokens": 407404492.0, + "step": 15757 + }, + { + "epoch": 1.7305073577860752, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4614667892456055, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 
0.7117149233818054, + "num_tokens": 407429503.0, + "step": 15758 + }, + { + "epoch": 1.7306171754886888, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2210354804992676, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7122777700424194, + "num_tokens": 407459862.0, + "step": 15759 + }, + { + "epoch": 1.7307269931913023, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.549481153488159, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.720493495464325, + "num_tokens": 407485344.0, + "step": 15760 + }, + { + "epoch": 1.730836810893916, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4839744567871094, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7180178165435791, + "num_tokens": 407508639.0, + "step": 15761 + }, + { + "epoch": 1.7309466285965298, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3610687255859375, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7064392566680908, + "num_tokens": 407535977.0, + "step": 15762 + }, + { + "epoch": 1.7310564462991436, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.485351085662842, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7277227640151978, + "num_tokens": 407557839.0, + "step": 15763 + }, + { + "epoch": 1.731166264001757, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.678581714630127, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7199934124946594, + "num_tokens": 407577992.0, + "step": 15764 + }, + { + "epoch": 1.7312760817043706, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4346790313720703, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7308157682418823, + "num_tokens": 407602088.0, + "step": 15765 + }, + { + "epoch": 1.7313858994069844, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.733254909515381, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7277141809463501, + "num_tokens": 407624483.0, + 
"step": 15766 + }, + { + "epoch": 1.7314957171095982, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.628962516784668, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6907278895378113, + "num_tokens": 407648718.0, + "step": 15767 + }, + { + "epoch": 1.7316055348122117, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5392773151397705, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7126895785331726, + "num_tokens": 407671524.0, + "step": 15768 + }, + { + "epoch": 1.7317153525148252, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5132339000701904, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.7083660364151001, + "num_tokens": 407696736.0, + "step": 15769 + }, + { + "epoch": 1.731825170217439, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5645253658294678, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7266409993171692, + "num_tokens": 407719242.0, + "step": 15770 + }, + { + "epoch": 1.7319349879200527, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.9480273723602295, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7197786569595337, + "num_tokens": 407741647.0, + "step": 15771 + }, + { + "epoch": 1.7320448056226665, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.464893102645874, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7198439836502075, + "num_tokens": 407766431.0, + "step": 15772 + }, + { + "epoch": 1.73215462332528, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.310774803161621, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7115238904953003, + "num_tokens": 407791725.0, + "step": 15773 + }, + { + "epoch": 1.7322644410278936, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5870399475097656, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6899615526199341, + "num_tokens": 407814167.0, + "step": 15774 + }, + { + "epoch": 1.7323742587305073, 
+ "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.214568853378296, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6893917322158813, + "num_tokens": 407843181.0, + "step": 15775 + }, + { + "epoch": 1.732484076433121, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4777121543884277, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6872357726097107, + "num_tokens": 407868658.0, + "step": 15776 + }, + { + "epoch": 1.7325938941357348, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.301708221435547, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7274366021156311, + "num_tokens": 407895419.0, + "step": 15777 + }, + { + "epoch": 1.7327037118383484, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3934669494628906, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7210344076156616, + "num_tokens": 407920037.0, + "step": 15778 + }, + { + "epoch": 1.732813529540962, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5154173374176025, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7078843116760254, + "num_tokens": 407943819.0, + "step": 15779 + }, + { + "epoch": 1.7329233472435757, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2003262042999268, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7020160555839539, + "num_tokens": 407974168.0, + "step": 15780 + }, + { + "epoch": 1.7330331649461894, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4160969257354736, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7198972105979919, + "num_tokens": 408000107.0, + "step": 15781 + }, + { + "epoch": 1.733142982648803, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.307115316390991, + "learning_rate": 1e-06, + "loss": 1.1061, + "mean_token_accuracy": 0.6748884916305542, + "num_tokens": 408029346.0, + "step": 15782 + }, + { + "epoch": 1.7332528003514165, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.5430421829223633, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6955592632293701, + "num_tokens": 408054076.0, + "step": 15783 + }, + { + "epoch": 1.7333626180540302, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1163294315338135, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6962316036224365, + "num_tokens": 408087889.0, + "step": 15784 + }, + { + "epoch": 1.733472435756644, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.254319190979004, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7075930237770081, + "num_tokens": 408115327.0, + "step": 15785 + }, + { + "epoch": 1.7335822534592578, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6047043800354004, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6881781816482544, + "num_tokens": 408139826.0, + "step": 15786 + }, + { + "epoch": 1.7336920711618713, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.558330774307251, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7064123153686523, + "num_tokens": 408163102.0, + "step": 15787 + }, + { + "epoch": 1.7338018888644848, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7221477031707764, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7069679498672485, + "num_tokens": 408186929.0, + "step": 15788 + }, + { + "epoch": 1.7339117065670986, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5337867736816406, + "learning_rate": 1e-06, + "loss": 1.1032, + "mean_token_accuracy": 0.6770362257957458, + "num_tokens": 408213628.0, + "step": 15789 + }, + { + "epoch": 1.7340215242697123, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2972652912139893, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.690256655216217, + "num_tokens": 408244207.0, + "step": 15790 + }, + { + "epoch": 1.734131341972326, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0861001014709473, + "learning_rate": 1e-06, + 
"loss": 0.9599, + "mean_token_accuracy": 0.7207126617431641, + "num_tokens": 408273952.0, + "step": 15791 + }, + { + "epoch": 1.7342411596749396, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.358837604522705, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7133413553237915, + "num_tokens": 408300411.0, + "step": 15792 + }, + { + "epoch": 1.7343509773775532, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.65450382232666, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7279331088066101, + "num_tokens": 408321815.0, + "step": 15793 + }, + { + "epoch": 1.734460795080167, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.662060499191284, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7378666400909424, + "num_tokens": 408343248.0, + "step": 15794 + }, + { + "epoch": 1.7345706127827807, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.496220588684082, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6938201785087585, + "num_tokens": 408368877.0, + "step": 15795 + }, + { + "epoch": 1.7346804304853942, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4726357460021973, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6920024752616882, + "num_tokens": 408394331.0, + "step": 15796 + }, + { + "epoch": 1.7347902481880078, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3256092071533203, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6930434703826904, + "num_tokens": 408419574.0, + "step": 15797 + }, + { + "epoch": 1.7349000658906215, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.251723289489746, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7110103964805603, + "num_tokens": 408446745.0, + "step": 15798 + }, + { + "epoch": 1.7350098835932353, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4403765201568604, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 
0.7045964002609253, + "num_tokens": 408470346.0, + "step": 15799 + }, + { + "epoch": 1.735119701295849, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3378515243530273, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7040932178497314, + "num_tokens": 408497927.0, + "step": 15800 + }, + { + "epoch": 1.7352295189984626, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4779651165008545, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6994807124137878, + "num_tokens": 408523528.0, + "step": 15801 + }, + { + "epoch": 1.735339336701076, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0140221118927, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7007330656051636, + "num_tokens": 408557516.0, + "step": 15802 + }, + { + "epoch": 1.7354491544036899, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.509941577911377, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.7023571729660034, + "num_tokens": 408582193.0, + "step": 15803 + }, + { + "epoch": 1.7355589721063036, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7089004516601562, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7202469110488892, + "num_tokens": 408602622.0, + "step": 15804 + }, + { + "epoch": 1.7356687898089171, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.56396484375, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.715144157409668, + "num_tokens": 408624316.0, + "step": 15805 + }, + { + "epoch": 1.735778607511531, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3491969108581543, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7176967263221741, + "num_tokens": 408652235.0, + "step": 15806 + }, + { + "epoch": 1.7358884252141444, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5158443450927734, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.7041784524917603, + "num_tokens": 408677377.0, + "step": 
15807 + }, + { + "epoch": 1.7359982429167582, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.435729742050171, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.7054803967475891, + "num_tokens": 408703453.0, + "step": 15808 + }, + { + "epoch": 1.736108060619372, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5841283798217773, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7337518930435181, + "num_tokens": 408727630.0, + "step": 15809 + }, + { + "epoch": 1.7362178783219855, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.49491024017334, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7028021812438965, + "num_tokens": 408750923.0, + "step": 15810 + }, + { + "epoch": 1.736327696024599, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4288744926452637, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7028499841690063, + "num_tokens": 408778970.0, + "step": 15811 + }, + { + "epoch": 1.7364375137272128, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.493269205093384, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7340850234031677, + "num_tokens": 408803405.0, + "step": 15812 + }, + { + "epoch": 1.7365473314298265, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.457345724105835, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6904981136322021, + "num_tokens": 408828951.0, + "step": 15813 + }, + { + "epoch": 1.7366571491324403, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1274046897888184, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6722564101219177, + "num_tokens": 408864782.0, + "step": 15814 + }, + { + "epoch": 1.7367669668350538, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.8878207206726074, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7289639711380005, + "num_tokens": 408883987.0, + "step": 15815 + }, + { + "epoch": 1.7368767845376674, + 
"ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3766727447509766, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6948404312133789, + "num_tokens": 408911114.0, + "step": 15816 + }, + { + "epoch": 1.7369866022402811, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6269166469573975, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7265793085098267, + "num_tokens": 408932732.0, + "step": 15817 + }, + { + "epoch": 1.7370964199428949, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4866366386413574, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7309474945068359, + "num_tokens": 408956139.0, + "step": 15818 + }, + { + "epoch": 1.7372062376455084, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4657270908355713, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7224041223526001, + "num_tokens": 408979450.0, + "step": 15819 + }, + { + "epoch": 1.7373160553481222, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.589901924133301, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7292464971542358, + "num_tokens": 409002261.0, + "step": 15820 + }, + { + "epoch": 1.7374258730507357, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 8.537260055541992, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7022936344146729, + "num_tokens": 409031473.0, + "step": 15821 + }, + { + "epoch": 1.7375356907533495, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.702934980392456, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7133087515830994, + "num_tokens": 409053824.0, + "step": 15822 + }, + { + "epoch": 1.7376455084559632, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.490323305130005, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6993489861488342, + "num_tokens": 409081507.0, + "step": 15823 + }, + { + "epoch": 1.7377553261585768, + "ewc_loss": 1.9073486328125e-05, + 
"grad_norm": 2.637820243835449, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7362064123153687, + "num_tokens": 409104613.0, + "step": 15824 + }, + { + "epoch": 1.7378651438611903, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3580312728881836, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7051272392272949, + "num_tokens": 409131953.0, + "step": 15825 + }, + { + "epoch": 1.737974961563804, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5093181133270264, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.7033343315124512, + "num_tokens": 409156621.0, + "step": 15826 + }, + { + "epoch": 1.7380847792664178, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3869850635528564, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7003325819969177, + "num_tokens": 409183001.0, + "step": 15827 + }, + { + "epoch": 1.7381945969690316, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.599904775619507, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7230612635612488, + "num_tokens": 409203539.0, + "step": 15828 + }, + { + "epoch": 1.738304414671645, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.311464309692383, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.715530514717102, + "num_tokens": 409230142.0, + "step": 15829 + }, + { + "epoch": 1.7384142323742586, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3734495639801025, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6938877105712891, + "num_tokens": 409255909.0, + "step": 15830 + }, + { + "epoch": 1.7385240500768724, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.314851999282837, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7243994474411011, + "num_tokens": 409282733.0, + "step": 15831 + }, + { + "epoch": 1.7386338677794861, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4863486289978027, + "learning_rate": 
1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7224519848823547, + "num_tokens": 409309821.0, + "step": 15832 + }, + { + "epoch": 1.7387436854820997, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.45688533782959, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7361905574798584, + "num_tokens": 409334055.0, + "step": 15833 + }, + { + "epoch": 1.7388535031847132, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6886963844299316, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.708295464515686, + "num_tokens": 409354915.0, + "step": 15834 + }, + { + "epoch": 1.738963320887327, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 3.015202760696411, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7380380034446716, + "num_tokens": 409372334.0, + "step": 15835 + }, + { + "epoch": 1.7390731385899407, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2377266883850098, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6865570545196533, + "num_tokens": 409401713.0, + "step": 15836 + }, + { + "epoch": 1.7391829562925545, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2227227687835693, + "learning_rate": 1e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.6871442794799805, + "num_tokens": 409431433.0, + "step": 15837 + }, + { + "epoch": 1.739292773995168, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.409090042114258, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7131212949752808, + "num_tokens": 409454487.0, + "step": 15838 + }, + { + "epoch": 1.7394025916977816, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.367337465286255, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7166754603385925, + "num_tokens": 409479958.0, + "step": 15839 + }, + { + "epoch": 1.7395124094003953, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.235426902770996, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 
0.7156904935836792, + "num_tokens": 409509442.0, + "step": 15840 + }, + { + "epoch": 1.739622227103009, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6576969623565674, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6994683742523193, + "num_tokens": 409530593.0, + "step": 15841 + }, + { + "epoch": 1.7397320448056228, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 3.6268746852874756, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7073055505752563, + "num_tokens": 409563789.0, + "step": 15842 + }, + { + "epoch": 1.7398418625082364, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2040250301361084, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7006793022155762, + "num_tokens": 409592783.0, + "step": 15843 + }, + { + "epoch": 1.73995168021085, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4178977012634277, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7067598104476929, + "num_tokens": 409617309.0, + "step": 15844 + }, + { + "epoch": 1.7400614979134637, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4413251876831055, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.702764630317688, + "num_tokens": 409642941.0, + "step": 15845 + }, + { + "epoch": 1.7401713156160774, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.286526679992676, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7192313075065613, + "num_tokens": 409669861.0, + "step": 15846 + }, + { + "epoch": 1.740281133318691, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.583799362182617, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6989534497261047, + "num_tokens": 409691814.0, + "step": 15847 + }, + { + "epoch": 1.7403909510213045, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.508108615875244, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.7019845843315125, + "num_tokens": 409717653.0, + 
"step": 15848 + }, + { + "epoch": 1.7405007687239182, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.671043872833252, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7139418125152588, + "num_tokens": 409738514.0, + "step": 15849 + }, + { + "epoch": 1.740610586426532, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.54843807220459, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6958468556404114, + "num_tokens": 409762917.0, + "step": 15850 + }, + { + "epoch": 1.7407204041291457, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4965200424194336, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7185519933700562, + "num_tokens": 409786764.0, + "step": 15851 + }, + { + "epoch": 1.7408302218317593, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4184608459472656, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7033059000968933, + "num_tokens": 409814012.0, + "step": 15852 + }, + { + "epoch": 1.7409400395343728, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.317739248275757, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7047973871231079, + "num_tokens": 409841290.0, + "step": 15853 + }, + { + "epoch": 1.7410498572369866, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.446726083755493, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7049299478530884, + "num_tokens": 409865997.0, + "step": 15854 + }, + { + "epoch": 1.7411596749396003, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5266382694244385, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.694650411605835, + "num_tokens": 409888915.0, + "step": 15855 + }, + { + "epoch": 1.741269492642214, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.033950090408325, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6903191208839417, + "num_tokens": 409922378.0, + "step": 15856 + }, + { + "epoch": 1.7413793103448276, + 
"ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.328235626220703, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7448353171348572, + "num_tokens": 409945678.0, + "step": 15857 + }, + { + "epoch": 1.7414891280474412, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4607319831848145, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7040023803710938, + "num_tokens": 409968835.0, + "step": 15858 + }, + { + "epoch": 1.741598945750055, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2515861988067627, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7032738924026489, + "num_tokens": 409997469.0, + "step": 15859 + }, + { + "epoch": 1.7417087634526687, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.418168783187866, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6964560747146606, + "num_tokens": 410022375.0, + "step": 15860 + }, + { + "epoch": 1.7418185811552822, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.201463460922241, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6838865280151367, + "num_tokens": 410054020.0, + "step": 15861 + }, + { + "epoch": 1.7419283988578957, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3627727031707764, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6958881616592407, + "num_tokens": 410079572.0, + "step": 15862 + }, + { + "epoch": 1.7420382165605095, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.215789556503296, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7215498685836792, + "num_tokens": 410106514.0, + "step": 15863 + }, + { + "epoch": 1.7421480342631233, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3612148761749268, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7070031762123108, + "num_tokens": 410132343.0, + "step": 15864 + }, + { + "epoch": 1.742257851965737, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.230966567993164, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7112118601799011, + "num_tokens": 410161866.0, + "step": 15865 + }, + { + "epoch": 1.7423676696683505, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.388148784637451, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6859934329986572, + "num_tokens": 410187236.0, + "step": 15866 + }, + { + "epoch": 1.742477487370964, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.250128746032715, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7092278003692627, + "num_tokens": 410216432.0, + "step": 15867 + }, + { + "epoch": 1.7425873050735778, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1979076862335205, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7167531251907349, + "num_tokens": 410244252.0, + "step": 15868 + }, + { + "epoch": 1.7426971227761916, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3111484050750732, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.692890465259552, + "num_tokens": 410271135.0, + "step": 15869 + }, + { + "epoch": 1.7428069404788051, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3742918968200684, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7110291123390198, + "num_tokens": 410296705.0, + "step": 15870 + }, + { + "epoch": 1.742916758181419, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.355017900466919, + "learning_rate": 1e-06, + "loss": 1.0825, + "mean_token_accuracy": 0.6777837872505188, + "num_tokens": 410324785.0, + "step": 15871 + }, + { + "epoch": 1.7430265758840324, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.350498914718628, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7199006080627441, + "num_tokens": 410350793.0, + "step": 15872 + }, + { + "epoch": 1.7431363935866462, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3123743534088135, + "learning_rate": 1e-06, + "loss": 
0.9752, + "mean_token_accuracy": 0.713628888130188, + "num_tokens": 410380193.0, + "step": 15873 + }, + { + "epoch": 1.74324621128926, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4776995182037354, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7338935136795044, + "num_tokens": 410403374.0, + "step": 15874 + }, + { + "epoch": 1.7433560289918735, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.247493267059326, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6971994638442993, + "num_tokens": 410431523.0, + "step": 15875 + }, + { + "epoch": 1.743465846694487, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.458378791809082, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7275212407112122, + "num_tokens": 410455780.0, + "step": 15876 + }, + { + "epoch": 1.7435756643971008, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3218464851379395, + "learning_rate": 1e-06, + "loss": 1.0824, + "mean_token_accuracy": 0.6910545229911804, + "num_tokens": 410485812.0, + "step": 15877 + }, + { + "epoch": 1.7436854820997145, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.750675916671753, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7026126980781555, + "num_tokens": 410507164.0, + "step": 15878 + }, + { + "epoch": 1.7437952998023283, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.345674753189087, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7201764583587646, + "num_tokens": 410534324.0, + "step": 15879 + }, + { + "epoch": 1.7439051175049418, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 3.7923977375030518, + "learning_rate": 1e-06, + "loss": 1.1067, + "mean_token_accuracy": 0.6752939820289612, + "num_tokens": 410564035.0, + "step": 15880 + }, + { + "epoch": 1.7440149352075554, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.835273265838623, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7186248898506165, + 
"num_tokens": 410582722.0, + "step": 15881 + }, + { + "epoch": 1.744124752910169, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4301180839538574, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7252774834632874, + "num_tokens": 410606232.0, + "step": 15882 + }, + { + "epoch": 1.7442345706127829, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3590967655181885, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6989789009094238, + "num_tokens": 410631951.0, + "step": 15883 + }, + { + "epoch": 1.7443443883153964, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1447160243988037, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7412594556808472, + "num_tokens": 410660421.0, + "step": 15884 + }, + { + "epoch": 1.7444542060180102, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.38313627243042, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6954228281974792, + "num_tokens": 410688148.0, + "step": 15885 + }, + { + "epoch": 1.7445640237206237, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1212830543518066, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6945030689239502, + "num_tokens": 410721790.0, + "step": 15886 + }, + { + "epoch": 1.7446738414232374, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.8772475719451904, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7213442325592041, + "num_tokens": 410740257.0, + "step": 15887 + }, + { + "epoch": 1.7447836591258512, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3553411960601807, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7250125408172607, + "num_tokens": 410763734.0, + "step": 15888 + }, + { + "epoch": 1.7448934768284647, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6112406253814697, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7198583483695984, + "num_tokens": 410785032.0, + "step": 15889 + }, + 
{ + "epoch": 1.7450032945310783, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.361119031906128, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6833410263061523, + "num_tokens": 410810970.0, + "step": 15890 + }, + { + "epoch": 1.745113112233692, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.350273370742798, + "learning_rate": 1e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.6842449903488159, + "num_tokens": 410841451.0, + "step": 15891 + }, + { + "epoch": 1.7452229299363058, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5376250743865967, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7128719687461853, + "num_tokens": 410865315.0, + "step": 15892 + }, + { + "epoch": 1.7453327476389195, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4909234046936035, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7011348009109497, + "num_tokens": 410889008.0, + "step": 15893 + }, + { + "epoch": 1.745442565341533, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.621610164642334, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6937071681022644, + "num_tokens": 410912808.0, + "step": 15894 + }, + { + "epoch": 1.7455523830441466, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.335456609725952, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7098174095153809, + "num_tokens": 410940907.0, + "step": 15895 + }, + { + "epoch": 1.7456622007467604, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.435396194458008, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7089620232582092, + "num_tokens": 410964706.0, + "step": 15896 + }, + { + "epoch": 1.7457720184493741, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.330418825149536, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7004818916320801, + "num_tokens": 410992747.0, + "step": 15897 + }, + { + "epoch": 1.7458818361519877, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.5303215980529785, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7466016411781311, + "num_tokens": 411016194.0, + "step": 15898 + }, + { + "epoch": 1.7459916538546012, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6479923725128174, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6943109631538391, + "num_tokens": 411037537.0, + "step": 15899 + }, + { + "epoch": 1.746101471557215, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4592976570129395, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.719488799571991, + "num_tokens": 411061701.0, + "step": 15900 + }, + { + "epoch": 1.7462112892598287, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.346648693084717, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7163440585136414, + "num_tokens": 411086094.0, + "step": 15901 + }, + { + "epoch": 1.7463211069624425, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5996506214141846, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.705238938331604, + "num_tokens": 411107452.0, + "step": 15902 + }, + { + "epoch": 1.746430924665056, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4758760929107666, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7299275398254395, + "num_tokens": 411129754.0, + "step": 15903 + }, + { + "epoch": 1.7465407423676695, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.623918056488037, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.7003786563873291, + "num_tokens": 411155350.0, + "step": 15904 + }, + { + "epoch": 1.7466505600702833, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2152106761932373, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7115664482116699, + "num_tokens": 411183490.0, + "step": 15905 + }, + { + "epoch": 1.746760377772897, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 
2.61287522315979, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6997265815734863, + "num_tokens": 411210915.0, + "step": 15906 + }, + { + "epoch": 1.7468701954755108, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3639583587646484, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.717018723487854, + "num_tokens": 411237205.0, + "step": 15907 + }, + { + "epoch": 1.7469800131781243, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.349515199661255, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6966918110847473, + "num_tokens": 411264664.0, + "step": 15908 + }, + { + "epoch": 1.7470898308807379, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3803892135620117, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7098866701126099, + "num_tokens": 411291043.0, + "step": 15909 + }, + { + "epoch": 1.7471996485833516, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.586013078689575, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6957942247390747, + "num_tokens": 411313227.0, + "step": 15910 + }, + { + "epoch": 1.7473094662859654, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5075976848602295, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6886132955551147, + "num_tokens": 411339882.0, + "step": 15911 + }, + { + "epoch": 1.747419283988579, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4702506065368652, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7083798050880432, + "num_tokens": 411366020.0, + "step": 15912 + }, + { + "epoch": 1.7475291016911925, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4041945934295654, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7187348008155823, + "num_tokens": 411388987.0, + "step": 15913 + }, + { + "epoch": 1.7476389193938062, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.38960337638855, + "learning_rate": 1e-06, + "loss": 
0.9513, + "mean_token_accuracy": 0.72539222240448, + "num_tokens": 411417028.0, + "step": 15914 + }, + { + "epoch": 1.74774873709642, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.724701166152954, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.7009831666946411, + "num_tokens": 411439589.0, + "step": 15915 + }, + { + "epoch": 1.7478585547990337, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7572755813598633, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6896594762802124, + "num_tokens": 411461548.0, + "step": 15916 + }, + { + "epoch": 1.7479683725016473, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2113895416259766, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7216389775276184, + "num_tokens": 411489361.0, + "step": 15917 + }, + { + "epoch": 1.7480781902042608, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1769702434539795, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.7032259702682495, + "num_tokens": 411519407.0, + "step": 15918 + }, + { + "epoch": 1.7481880079068746, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1821823120117188, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7150964736938477, + "num_tokens": 411547927.0, + "step": 15919 + }, + { + "epoch": 1.7482978256094883, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.364387035369873, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7026488184928894, + "num_tokens": 411575453.0, + "step": 15920 + }, + { + "epoch": 1.7484076433121019, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2987921237945557, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7216867208480835, + "num_tokens": 411603169.0, + "step": 15921 + }, + { + "epoch": 1.7485174610147156, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1237218379974365, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6945279836654663, 
+ "num_tokens": 411635310.0, + "step": 15922 + }, + { + "epoch": 1.7486272787173291, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1194024085998535, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7288357615470886, + "num_tokens": 411666693.0, + "step": 15923 + }, + { + "epoch": 1.748737096419943, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4198083877563477, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6998904943466187, + "num_tokens": 411694091.0, + "step": 15924 + }, + { + "epoch": 1.7488469141225567, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.7575955390930176, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7108474969863892, + "num_tokens": 411715521.0, + "step": 15925 + }, + { + "epoch": 1.7489567318251702, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.199849843978882, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7132267951965332, + "num_tokens": 411744107.0, + "step": 15926 + }, + { + "epoch": 1.7490665495277837, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.130324602127075, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7017194628715515, + "num_tokens": 411774228.0, + "step": 15927 + }, + { + "epoch": 1.7491763672303975, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.972043514251709, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7186141014099121, + "num_tokens": 411791279.0, + "step": 15928 + }, + { + "epoch": 1.7492861849330112, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5172338485717773, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7207473516464233, + "num_tokens": 411813971.0, + "step": 15929 + }, + { + "epoch": 1.749396002635625, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6132640838623047, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7127854228019714, + "num_tokens": 411836624.0, + "step": 15930 + }, + { 
+ "epoch": 1.7495058203382385, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4041545391082764, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.7081575393676758, + "num_tokens": 411862879.0, + "step": 15931 + }, + { + "epoch": 1.749615638040852, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2742795944213867, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.712719202041626, + "num_tokens": 411890777.0, + "step": 15932 + }, + { + "epoch": 1.7497254557434658, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0208828449249268, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6984800696372986, + "num_tokens": 411929648.0, + "step": 15933 + }, + { + "epoch": 1.7498352734460796, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.093141794204712, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7332379221916199, + "num_tokens": 411959716.0, + "step": 15934 + }, + { + "epoch": 1.7499450911486931, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3923697471618652, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7117911577224731, + "num_tokens": 411984294.0, + "step": 15935 + }, + { + "epoch": 1.7500549088513069, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.339517831802368, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7073586583137512, + "num_tokens": 412008278.0, + "step": 15936 + }, + { + "epoch": 1.7501647265539204, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.361220121383667, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7237297296524048, + "num_tokens": 412035117.0, + "step": 15937 + }, + { + "epoch": 1.7502745442565342, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.382220506668091, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.713506281375885, + "num_tokens": 412059205.0, + "step": 15938 + }, + { + "epoch": 1.750384361959148, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.6390061378479004, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6991156339645386, + "num_tokens": 412082773.0, + "step": 15939 + }, + { + "epoch": 1.7504941796617615, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2660253047943115, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7074154615402222, + "num_tokens": 412111838.0, + "step": 15940 + }, + { + "epoch": 1.750603997364375, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.8896477222442627, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7222398519515991, + "num_tokens": 412131623.0, + "step": 15941 + }, + { + "epoch": 1.7507138150669888, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2904107570648193, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7086129188537598, + "num_tokens": 412161242.0, + "step": 15942 + }, + { + "epoch": 1.7508236327696025, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.645080804824829, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7281432151794434, + "num_tokens": 412182544.0, + "step": 15943 + }, + { + "epoch": 1.7509334504722163, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.505840301513672, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.6998034119606018, + "num_tokens": 412207490.0, + "step": 15944 + }, + { + "epoch": 1.7510432681748298, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3085198402404785, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6986974477767944, + "num_tokens": 412235966.0, + "step": 15945 + }, + { + "epoch": 1.7511530858774433, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6151323318481445, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7172828316688538, + "num_tokens": 412259956.0, + "step": 15946 + }, + { + "epoch": 1.751262903580057, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.314484119415283, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.689265251159668, + "num_tokens": 412289410.0, + "step": 15947 + }, + { + "epoch": 1.7513727212826709, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3596882820129395, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7092668414115906, + "num_tokens": 412316136.0, + "step": 15948 + }, + { + "epoch": 1.7514825389852844, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.260885000228882, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7166960835456848, + "num_tokens": 412344879.0, + "step": 15949 + }, + { + "epoch": 1.7515923566878981, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.219550371170044, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7118237614631653, + "num_tokens": 412371632.0, + "step": 15950 + }, + { + "epoch": 1.7517021743905117, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1107752323150635, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7220115661621094, + "num_tokens": 412400581.0, + "step": 15951 + }, + { + "epoch": 1.7518119920931254, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5155622959136963, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6890435218811035, + "num_tokens": 412425480.0, + "step": 15952 + }, + { + "epoch": 1.7519218097957392, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.55092716217041, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.713780403137207, + "num_tokens": 412448255.0, + "step": 15953 + }, + { + "epoch": 1.7520316274983527, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4533510208129883, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7091308832168579, + "num_tokens": 412471255.0, + "step": 15954 + }, + { + "epoch": 1.7521414452009663, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5809502601623535, + "learning_rate": 1e-06, + "loss": 
0.8922, + "mean_token_accuracy": 0.7316759824752808, + "num_tokens": 412491773.0, + "step": 15955 + }, + { + "epoch": 1.75225126290358, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.290062189102173, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.7100077867507935, + "num_tokens": 412521308.0, + "step": 15956 + }, + { + "epoch": 1.7523610806061938, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.293240547180176, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.7031744718551636, + "num_tokens": 412547588.0, + "step": 15957 + }, + { + "epoch": 1.7524708983088075, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.321187734603882, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6945964694023132, + "num_tokens": 412576696.0, + "step": 15958 + }, + { + "epoch": 1.752580716011421, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6822237968444824, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7001467347145081, + "num_tokens": 412598512.0, + "step": 15959 + }, + { + "epoch": 1.7526905337140346, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.449475049972534, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7080212831497192, + "num_tokens": 412624152.0, + "step": 15960 + }, + { + "epoch": 1.7528003514166484, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2799623012542725, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7309916019439697, + "num_tokens": 412649925.0, + "step": 15961 + }, + { + "epoch": 1.7529101691192621, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.8976895809173584, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6964520215988159, + "num_tokens": 412689788.0, + "step": 15962 + }, + { + "epoch": 1.7530199868218757, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5216946601867676, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7304925918579102, 
+ "num_tokens": 412712857.0, + "step": 15963 + }, + { + "epoch": 1.7531298045244892, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5399277210235596, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6979629993438721, + "num_tokens": 412739070.0, + "step": 15964 + }, + { + "epoch": 1.753239622227103, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1481411457061768, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7139861583709717, + "num_tokens": 412768299.0, + "step": 15965 + }, + { + "epoch": 1.7533494399297167, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.60257625579834, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7069776058197021, + "num_tokens": 412790910.0, + "step": 15966 + }, + { + "epoch": 1.7534592576323305, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3461318016052246, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7146499752998352, + "num_tokens": 412816326.0, + "step": 15967 + }, + { + "epoch": 1.753569075334944, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.375701904296875, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.729589581489563, + "num_tokens": 412842083.0, + "step": 15968 + }, + { + "epoch": 1.7536788930375575, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.689397096633911, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7044894695281982, + "num_tokens": 412862592.0, + "step": 15969 + }, + { + "epoch": 1.7537887107401713, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6746666431427, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7115939855575562, + "num_tokens": 412886549.0, + "step": 15970 + }, + { + "epoch": 1.753898528442785, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.31912899017334, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6858547329902649, + "num_tokens": 412912929.0, + "step": 15971 + }, + { + 
"epoch": 1.7540083461453988, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5940487384796143, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7125146389007568, + "num_tokens": 412934306.0, + "step": 15972 + }, + { + "epoch": 1.7541181638480123, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.25032114982605, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7111194133758545, + "num_tokens": 412963595.0, + "step": 15973 + }, + { + "epoch": 1.7542279815506259, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1037189960479736, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7036980390548706, + "num_tokens": 412999685.0, + "step": 15974 + }, + { + "epoch": 1.7543377992532396, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.19870662689209, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7048245072364807, + "num_tokens": 413030652.0, + "step": 15975 + }, + { + "epoch": 1.7544476169558534, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.333831548690796, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6968644857406616, + "num_tokens": 413058995.0, + "step": 15976 + }, + { + "epoch": 1.754557434658467, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6814708709716797, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7019544839859009, + "num_tokens": 413081066.0, + "step": 15977 + }, + { + "epoch": 1.7546672523610805, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3531687259674072, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7129079103469849, + "num_tokens": 413109710.0, + "step": 15978 + }, + { + "epoch": 1.7547770700636942, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.433896541595459, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7203479409217834, + "num_tokens": 413133397.0, + "step": 15979 + }, + { + "epoch": 1.754886887766308, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.0921459197998047, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.7015644311904907, + "num_tokens": 413165579.0, + "step": 15980 + }, + { + "epoch": 1.7549967054689217, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.505389451980591, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.725547730922699, + "num_tokens": 413190182.0, + "step": 15981 + }, + { + "epoch": 1.7551065231715353, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4140796661376953, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7076506614685059, + "num_tokens": 413214874.0, + "step": 15982 + }, + { + "epoch": 1.7552163408741488, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1780614852905273, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.706856369972229, + "num_tokens": 413244392.0, + "step": 15983 + }, + { + "epoch": 1.7553261585767626, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.198833465576172, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7081185579299927, + "num_tokens": 413276386.0, + "step": 15984 + }, + { + "epoch": 1.7554359762793763, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2914931774139404, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7139956951141357, + "num_tokens": 413304406.0, + "step": 15985 + }, + { + "epoch": 1.7555457939819898, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3412294387817383, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7152977585792542, + "num_tokens": 413329957.0, + "step": 15986 + }, + { + "epoch": 1.7556556116846036, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2785282135009766, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7088030576705933, + "num_tokens": 413355676.0, + "step": 15987 + }, + { + "epoch": 1.7557654293872171, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.405935287475586, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.7150208950042725, + "num_tokens": 413377392.0, + "step": 15988 + }, + { + "epoch": 1.755875247089831, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4442145824432373, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7342768907546997, + "num_tokens": 413398830.0, + "step": 15989 + }, + { + "epoch": 1.7559850647924446, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1946675777435303, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6946803331375122, + "num_tokens": 413429855.0, + "step": 15990 + }, + { + "epoch": 1.7560948824950582, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.253854513168335, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7156842947006226, + "num_tokens": 413456455.0, + "step": 15991 + }, + { + "epoch": 1.7562047001976717, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5686378479003906, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7094650864601135, + "num_tokens": 413479740.0, + "step": 15992 + }, + { + "epoch": 1.7563145179002855, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.509747266769409, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6943362951278687, + "num_tokens": 413505148.0, + "step": 15993 + }, + { + "epoch": 1.7564243356028992, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4559099674224854, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7112576365470886, + "num_tokens": 413530087.0, + "step": 15994 + }, + { + "epoch": 1.756534153305513, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.476383924484253, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7274678349494934, + "num_tokens": 413556001.0, + "step": 15995 + }, + { + "epoch": 1.7566439710081265, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.500413417816162, + "learning_rate": 1e-06, + "loss": 
0.8683, + "mean_token_accuracy": 0.7381442189216614, + "num_tokens": 413578697.0, + "step": 15996 + }, + { + "epoch": 1.75675378871074, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.413912057876587, + "learning_rate": 1e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.6930444240570068, + "num_tokens": 413607179.0, + "step": 15997 + }, + { + "epoch": 1.7568636064133538, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2120115756988525, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7040053606033325, + "num_tokens": 413635705.0, + "step": 15998 + }, + { + "epoch": 1.7569734241159676, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4611690044403076, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7384721040725708, + "num_tokens": 413658062.0, + "step": 15999 + }, + { + "epoch": 1.757083241818581, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4750187397003174, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7287917733192444, + "num_tokens": 413679936.0, + "step": 16000 + }, + { + "epoch": 1.7571930595211949, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6777029037475586, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7084589004516602, + "num_tokens": 413700924.0, + "step": 16001 + }, + { + "epoch": 1.7573028772238084, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6624503135681152, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7210070490837097, + "num_tokens": 413720728.0, + "step": 16002 + }, + { + "epoch": 1.7574126949264222, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.320071220397949, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7278482913970947, + "num_tokens": 413747592.0, + "step": 16003 + }, + { + "epoch": 1.757522512629036, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1245436668395996, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7111510634422302, 
+ "num_tokens": 413779379.0, + "step": 16004 + }, + { + "epoch": 1.7576323303316495, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5970447063446045, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7028915286064148, + "num_tokens": 413801675.0, + "step": 16005 + }, + { + "epoch": 1.757742148034263, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3771090507507324, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6962434649467468, + "num_tokens": 413828275.0, + "step": 16006 + }, + { + "epoch": 1.7578519657368767, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2180495262145996, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7147195339202881, + "num_tokens": 413859114.0, + "step": 16007 + }, + { + "epoch": 1.7579617834394905, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2093875408172607, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7255541682243347, + "num_tokens": 413886825.0, + "step": 16008 + }, + { + "epoch": 1.7580716011421043, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0192437171936035, + "learning_rate": 1e-06, + "loss": 1.1338, + "mean_token_accuracy": 0.6773402094841003, + "num_tokens": 413925084.0, + "step": 16009 + }, + { + "epoch": 1.7581814188447178, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.474001169204712, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.6867577433586121, + "num_tokens": 413952206.0, + "step": 16010 + }, + { + "epoch": 1.7582912365473313, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.510629177093506, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6962230801582336, + "num_tokens": 413975722.0, + "step": 16011 + }, + { + "epoch": 1.758401054249945, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.330325126647949, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7275606393814087, + "num_tokens": 414000120.0, + "step": 16012 + }, + { 
+ "epoch": 1.7585108719525588, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.604950189590454, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7131187319755554, + "num_tokens": 414022521.0, + "step": 16013 + }, + { + "epoch": 1.7586206896551724, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.360943555831909, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6957471966743469, + "num_tokens": 414047699.0, + "step": 16014 + }, + { + "epoch": 1.758730507357786, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5542185306549072, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7068751454353333, + "num_tokens": 414071095.0, + "step": 16015 + }, + { + "epoch": 1.7588403250603997, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.387791156768799, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7271323204040527, + "num_tokens": 414096434.0, + "step": 16016 + }, + { + "epoch": 1.7589501427630134, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4724435806274414, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.71246737241745, + "num_tokens": 414120608.0, + "step": 16017 + }, + { + "epoch": 1.7590599604656272, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4891719818115234, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7513037919998169, + "num_tokens": 414142862.0, + "step": 16018 + }, + { + "epoch": 1.7591697781682407, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2106525897979736, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6916232705116272, + "num_tokens": 414173434.0, + "step": 16019 + }, + { + "epoch": 1.7592795958708543, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.441417932510376, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7255504131317139, + "num_tokens": 414199609.0, + "step": 16020 + }, + { + "epoch": 1.759389413573468, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.446870803833008, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.7031910419464111, + "num_tokens": 414225107.0, + "step": 16021 + }, + { + "epoch": 1.7594992312760818, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.213719367980957, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7275158166885376, + "num_tokens": 414251708.0, + "step": 16022 + }, + { + "epoch": 1.7596090489786955, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2133171558380127, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7418724894523621, + "num_tokens": 414279664.0, + "step": 16023 + }, + { + "epoch": 1.759718866681309, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2200703620910645, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6921746730804443, + "num_tokens": 414309844.0, + "step": 16024 + }, + { + "epoch": 1.7598286843839226, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4363279342651367, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.710227370262146, + "num_tokens": 414333773.0, + "step": 16025 + }, + { + "epoch": 1.7599385020865363, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.316129684448242, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.707325279712677, + "num_tokens": 414360364.0, + "step": 16026 + }, + { + "epoch": 1.76004831978915, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2966291904449463, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.736255407333374, + "num_tokens": 414387365.0, + "step": 16027 + }, + { + "epoch": 1.7601581374917636, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2905983924865723, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7205529808998108, + "num_tokens": 414413067.0, + "step": 16028 + }, + { + "epoch": 1.7602679551943772, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.360487699508667, + 
"learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6989067792892456, + "num_tokens": 414440864.0, + "step": 16029 + }, + { + "epoch": 1.760377772896991, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.29734206199646, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.702226996421814, + "num_tokens": 414467877.0, + "step": 16030 + }, + { + "epoch": 1.7604875905996047, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 7.001506328582764, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7285721898078918, + "num_tokens": 414492795.0, + "step": 16031 + }, + { + "epoch": 1.7605974083022184, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.361246109008789, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.721428394317627, + "num_tokens": 414519885.0, + "step": 16032 + }, + { + "epoch": 1.760707226004832, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.176203489303589, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7154803276062012, + "num_tokens": 414549805.0, + "step": 16033 + }, + { + "epoch": 1.7608170437074455, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.315603733062744, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7205392122268677, + "num_tokens": 414576865.0, + "step": 16034 + }, + { + "epoch": 1.7609268614100593, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.215250015258789, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7021541595458984, + "num_tokens": 414605167.0, + "step": 16035 + }, + { + "epoch": 1.761036679112673, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5947301387786865, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7157214283943176, + "num_tokens": 414626986.0, + "step": 16036 + }, + { + "epoch": 1.7611464968152868, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.328447103500366, + "learning_rate": 1e-06, + "loss": 1.0292, + 
"mean_token_accuracy": 0.6983965635299683, + "num_tokens": 414655905.0, + "step": 16037 + }, + { + "epoch": 1.7612563145179003, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3509721755981445, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7023177146911621, + "num_tokens": 414682696.0, + "step": 16038 + }, + { + "epoch": 1.7613661322205139, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.306105136871338, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7132805585861206, + "num_tokens": 414709072.0, + "step": 16039 + }, + { + "epoch": 1.7614759499231276, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3541312217712402, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7305858135223389, + "num_tokens": 414734163.0, + "step": 16040 + }, + { + "epoch": 1.7615857676257414, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4847545623779297, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7330752611160278, + "num_tokens": 414757528.0, + "step": 16041 + }, + { + "epoch": 1.761695585328355, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.502856492996216, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7042257785797119, + "num_tokens": 414781026.0, + "step": 16042 + }, + { + "epoch": 1.7618054030309684, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2644283771514893, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7113780975341797, + "num_tokens": 414811374.0, + "step": 16043 + }, + { + "epoch": 1.7619152207335822, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.459479570388794, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7148781418800354, + "num_tokens": 414834105.0, + "step": 16044 + }, + { + "epoch": 1.762025038436196, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2581212520599365, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6973525285720825, + 
"num_tokens": 414863277.0, + "step": 16045 + }, + { + "epoch": 1.7621348561388097, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.43520188331604, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7202668190002441, + "num_tokens": 414888365.0, + "step": 16046 + }, + { + "epoch": 1.7622446738414232, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2905478477478027, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7221754789352417, + "num_tokens": 414914796.0, + "step": 16047 + }, + { + "epoch": 1.7623544915440368, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1375393867492676, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.7006103992462158, + "num_tokens": 414946541.0, + "step": 16048 + }, + { + "epoch": 1.7624643092466505, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5844805240631104, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7121890783309937, + "num_tokens": 414970385.0, + "step": 16049 + }, + { + "epoch": 1.7625741269492643, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2969584465026855, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7077059149742126, + "num_tokens": 414997241.0, + "step": 16050 + }, + { + "epoch": 1.7626839446518778, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3993561267852783, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7096066474914551, + "num_tokens": 415025244.0, + "step": 16051 + }, + { + "epoch": 1.7627937623544916, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2136034965515137, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7372387647628784, + "num_tokens": 415051255.0, + "step": 16052 + }, + { + "epoch": 1.7629035800571051, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1303470134735107, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7260568737983704, + "num_tokens": 415080247.0, + "step": 16053 + }, + 
{ + "epoch": 1.7630133977597189, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2169394493103027, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6993288993835449, + "num_tokens": 415109155.0, + "step": 16054 + }, + { + "epoch": 1.7631232154623326, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.462646961212158, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.716389536857605, + "num_tokens": 415133469.0, + "step": 16055 + }, + { + "epoch": 1.7632330331649462, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5335066318511963, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7003105878829956, + "num_tokens": 415155715.0, + "step": 16056 + }, + { + "epoch": 1.7633428508675597, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.9489625692367554, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7020807266235352, + "num_tokens": 415192246.0, + "step": 16057 + }, + { + "epoch": 1.7634526685701735, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.53212571144104, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7114999294281006, + "num_tokens": 415219216.0, + "step": 16058 + }, + { + "epoch": 1.7635624862727872, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.53713059425354, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7153638601303101, + "num_tokens": 415245103.0, + "step": 16059 + }, + { + "epoch": 1.763672303975401, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2096447944641113, + "learning_rate": 1e-06, + "loss": 1.0789, + "mean_token_accuracy": 0.6893688440322876, + "num_tokens": 415276220.0, + "step": 16060 + }, + { + "epoch": 1.7637821216780145, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3359217643737793, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7240368127822876, + "num_tokens": 415303037.0, + "step": 16061 + }, + { + "epoch": 1.763891939380628, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.5579800605773926, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7096396684646606, + "num_tokens": 415324617.0, + "step": 16062 + }, + { + "epoch": 1.7640017570832418, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5833656787872314, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7094792723655701, + "num_tokens": 415349364.0, + "step": 16063 + }, + { + "epoch": 1.7641115747858556, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3465042114257812, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6990073919296265, + "num_tokens": 415376114.0, + "step": 16064 + }, + { + "epoch": 1.764221392488469, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4274518489837646, + "learning_rate": 1e-06, + "loss": 1.1315, + "mean_token_accuracy": 0.6739655137062073, + "num_tokens": 415406244.0, + "step": 16065 + }, + { + "epoch": 1.7643312101910829, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5543129444122314, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6952744722366333, + "num_tokens": 415428018.0, + "step": 16066 + }, + { + "epoch": 1.7644410278936964, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3720035552978516, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6939356327056885, + "num_tokens": 415455075.0, + "step": 16067 + }, + { + "epoch": 1.7645508455963101, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1946897506713867, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.694121778011322, + "num_tokens": 415485792.0, + "step": 16068 + }, + { + "epoch": 1.764660663298924, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3866126537323, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6961906552314758, + "num_tokens": 415514560.0, + "step": 16069 + }, + { + "epoch": 1.7647704810015374, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 
2.659541606903076, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7126126885414124, + "num_tokens": 415537026.0, + "step": 16070 + }, + { + "epoch": 1.764880298704151, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.401045560836792, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6914201974868774, + "num_tokens": 415565677.0, + "step": 16071 + }, + { + "epoch": 1.7649901164067647, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3224008083343506, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7017368078231812, + "num_tokens": 415594587.0, + "step": 16072 + }, + { + "epoch": 1.7650999341093785, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2818140983581543, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7052643299102783, + "num_tokens": 415622755.0, + "step": 16073 + }, + { + "epoch": 1.7652097518119922, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1479783058166504, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.713347315788269, + "num_tokens": 415652436.0, + "step": 16074 + }, + { + "epoch": 1.7653195695146058, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.591975212097168, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.7027865648269653, + "num_tokens": 415676475.0, + "step": 16075 + }, + { + "epoch": 1.7654293872172193, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.376152992248535, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6986850500106812, + "num_tokens": 415703020.0, + "step": 16076 + }, + { + "epoch": 1.765539204919833, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.319491386413574, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6997811794281006, + "num_tokens": 415729866.0, + "step": 16077 + }, + { + "epoch": 1.7656490226224468, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.581350088119507, + "learning_rate": 1e-06, + 
"loss": 0.9331, + "mean_token_accuracy": 0.7169415950775146, + "num_tokens": 415752142.0, + "step": 16078 + }, + { + "epoch": 1.7657588403250604, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.352478504180908, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.7027187347412109, + "num_tokens": 415780469.0, + "step": 16079 + }, + { + "epoch": 1.765868658027674, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.299406051635742, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.697344958782196, + "num_tokens": 415809635.0, + "step": 16080 + }, + { + "epoch": 1.7659784757302877, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5681021213531494, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7043455839157104, + "num_tokens": 415832723.0, + "step": 16081 + }, + { + "epoch": 1.7660882934329014, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4249796867370605, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6938498616218567, + "num_tokens": 415857939.0, + "step": 16082 + }, + { + "epoch": 1.7661981111355152, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.469261407852173, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7086886167526245, + "num_tokens": 415881968.0, + "step": 16083 + }, + { + "epoch": 1.7663079288381287, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 6.98129940032959, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7042708396911621, + "num_tokens": 415911357.0, + "step": 16084 + }, + { + "epoch": 1.7664177465407422, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3163599967956543, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7228039503097534, + "num_tokens": 415936779.0, + "step": 16085 + }, + { + "epoch": 1.766527564243356, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.355560302734375, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 
0.6928688287734985, + "num_tokens": 415964168.0, + "step": 16086 + }, + { + "epoch": 1.7666373819459698, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4609158039093018, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7201927304267883, + "num_tokens": 415988411.0, + "step": 16087 + }, + { + "epoch": 1.7667471996485835, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1930174827575684, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6915419101715088, + "num_tokens": 416018363.0, + "step": 16088 + }, + { + "epoch": 1.766857017351197, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3378515243530273, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.72292160987854, + "num_tokens": 416046232.0, + "step": 16089 + }, + { + "epoch": 1.7669668350538106, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.330636978149414, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.726732611656189, + "num_tokens": 416073759.0, + "step": 16090 + }, + { + "epoch": 1.7670766527564243, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3818509578704834, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.7059823870658875, + "num_tokens": 416098163.0, + "step": 16091 + }, + { + "epoch": 1.767186470459038, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1791470050811768, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7055655121803284, + "num_tokens": 416126423.0, + "step": 16092 + }, + { + "epoch": 1.7672962881616516, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.672807455062866, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7318991422653198, + "num_tokens": 416147882.0, + "step": 16093 + }, + { + "epoch": 1.7674061058642652, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5680787563323975, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7150722742080688, + "num_tokens": 416169273.0, + 
"step": 16094 + }, + { + "epoch": 1.767515923566879, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.498796224594116, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6935409903526306, + "num_tokens": 416192765.0, + "step": 16095 + }, + { + "epoch": 1.7676257412694927, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4677822589874268, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7168845534324646, + "num_tokens": 416216952.0, + "step": 16096 + }, + { + "epoch": 1.7677355589721064, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3887100219726562, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7199288606643677, + "num_tokens": 416243681.0, + "step": 16097 + }, + { + "epoch": 1.76784537667472, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4169390201568604, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7214399576187134, + "num_tokens": 416270792.0, + "step": 16098 + }, + { + "epoch": 1.7679551943773335, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.364931344985962, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7013739347457886, + "num_tokens": 416298995.0, + "step": 16099 + }, + { + "epoch": 1.7680650120799473, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2422938346862793, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7183545827865601, + "num_tokens": 416324419.0, + "step": 16100 + }, + { + "epoch": 1.768174829782561, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.664151906967163, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7188053131103516, + "num_tokens": 416347089.0, + "step": 16101 + }, + { + "epoch": 1.7682846474851748, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5456600189208984, + "learning_rate": 1e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.6835132837295532, + "num_tokens": 416372595.0, + "step": 16102 + }, + { + "epoch": 
1.7683944651877883, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.674473762512207, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7139963507652283, + "num_tokens": 416393401.0, + "step": 16103 + }, + { + "epoch": 1.7685042828904018, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6267385482788086, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7178183794021606, + "num_tokens": 416416802.0, + "step": 16104 + }, + { + "epoch": 1.7686141005930156, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.384427785873413, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6975224614143372, + "num_tokens": 416446841.0, + "step": 16105 + }, + { + "epoch": 1.7687239182956294, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4760336875915527, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7291947603225708, + "num_tokens": 416471615.0, + "step": 16106 + }, + { + "epoch": 1.768833735998243, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.303579330444336, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6990658044815063, + "num_tokens": 416500878.0, + "step": 16107 + }, + { + "epoch": 1.7689435537008564, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5243711471557617, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7233151197433472, + "num_tokens": 416524480.0, + "step": 16108 + }, + { + "epoch": 1.7690533714034702, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4603383541107178, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7231228351593018, + "num_tokens": 416549422.0, + "step": 16109 + }, + { + "epoch": 1.769163189106084, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.423168897628784, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7272891998291016, + "num_tokens": 416574267.0, + "step": 16110 + }, + { + "epoch": 1.7692730068086977, + "ewc_loss": 
1.9073486328125e-05, + "grad_norm": 2.3059163093566895, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6971392631530762, + "num_tokens": 416602495.0, + "step": 16111 + }, + { + "epoch": 1.7693828245113112, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2057759761810303, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.7001991271972656, + "num_tokens": 416631999.0, + "step": 16112 + }, + { + "epoch": 1.7694926422139248, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2224464416503906, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6994113922119141, + "num_tokens": 416664289.0, + "step": 16113 + }, + { + "epoch": 1.7696024599165385, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.681373119354248, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7082329988479614, + "num_tokens": 416686554.0, + "step": 16114 + }, + { + "epoch": 1.7697122776191523, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3638012409210205, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7034459114074707, + "num_tokens": 416715570.0, + "step": 16115 + }, + { + "epoch": 1.7698220953217658, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3042521476745605, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6990442276000977, + "num_tokens": 416743489.0, + "step": 16116 + }, + { + "epoch": 1.7699319130243796, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2447404861450195, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7115643620491028, + "num_tokens": 416771487.0, + "step": 16117 + }, + { + "epoch": 1.7700417307269931, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.242859125137329, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7084189653396606, + "num_tokens": 416801416.0, + "step": 16118 + }, + { + "epoch": 1.7701515484296069, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 
2.557028293609619, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7206032872200012, + "num_tokens": 416823188.0, + "step": 16119 + }, + { + "epoch": 1.7702613661322206, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.39115571975708, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7112513780593872, + "num_tokens": 416849104.0, + "step": 16120 + }, + { + "epoch": 1.7703711838348342, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3528122901916504, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6948977708816528, + "num_tokens": 416876435.0, + "step": 16121 + }, + { + "epoch": 1.7704810015374477, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.08369517326355, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.706484317779541, + "num_tokens": 416910832.0, + "step": 16122 + }, + { + "epoch": 1.7705908192400615, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.393160104751587, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6968750953674316, + "num_tokens": 416939844.0, + "step": 16123 + }, + { + "epoch": 1.7707006369426752, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2935376167297363, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7225526571273804, + "num_tokens": 416966003.0, + "step": 16124 + }, + { + "epoch": 1.770810454645289, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.647993564605713, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7137424945831299, + "num_tokens": 416985736.0, + "step": 16125 + }, + { + "epoch": 1.7709202723479025, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5687596797943115, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7120575308799744, + "num_tokens": 417007408.0, + "step": 16126 + }, + { + "epoch": 1.771030090050516, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.0643324851989746, + "learning_rate": 1e-06, + 
"loss": 0.9228, + "mean_token_accuracy": 0.7259923219680786, + "num_tokens": 417039428.0, + "step": 16127 + }, + { + "epoch": 1.7711399077531298, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3950035572052, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7354785799980164, + "num_tokens": 417062279.0, + "step": 16128 + }, + { + "epoch": 1.7712497254557436, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.428276300430298, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7203294634819031, + "num_tokens": 417086262.0, + "step": 16129 + }, + { + "epoch": 1.771359543158357, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.289489984512329, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6997309327125549, + "num_tokens": 417113676.0, + "step": 16130 + }, + { + "epoch": 1.7714693608609708, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4305078983306885, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7124696969985962, + "num_tokens": 417137542.0, + "step": 16131 + }, + { + "epoch": 1.7715791785635844, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6083743572235107, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7090879678726196, + "num_tokens": 417160678.0, + "step": 16132 + }, + { + "epoch": 1.7716889962661981, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.613723039627075, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6989894509315491, + "num_tokens": 417183766.0, + "step": 16133 + }, + { + "epoch": 1.771798813968812, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.6167855262756348, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6876431703567505, + "num_tokens": 417207715.0, + "step": 16134 + }, + { + "epoch": 1.7719086316714254, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5510430335998535, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 
0.7171597480773926, + "num_tokens": 417232832.0, + "step": 16135 + }, + { + "epoch": 1.772018449374039, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1739003658294678, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7096340656280518, + "num_tokens": 417262091.0, + "step": 16136 + }, + { + "epoch": 1.7721282670766527, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.216353416442871, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7007699608802795, + "num_tokens": 417293057.0, + "step": 16137 + }, + { + "epoch": 1.7722380847792665, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 7.081794261932373, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7139061093330383, + "num_tokens": 417317893.0, + "step": 16138 + }, + { + "epoch": 1.7723479024818802, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3226397037506104, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.720872700214386, + "num_tokens": 417345068.0, + "step": 16139 + }, + { + "epoch": 1.7724577201844938, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.330643892288208, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7207790613174438, + "num_tokens": 417372931.0, + "step": 16140 + }, + { + "epoch": 1.7725675378871073, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.4619109630584717, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7057763338088989, + "num_tokens": 417395646.0, + "step": 16141 + }, + { + "epoch": 1.772677355589721, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2128968238830566, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.7007014751434326, + "num_tokens": 417426697.0, + "step": 16142 + }, + { + "epoch": 1.7727871732923348, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.546863317489624, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7362627387046814, + "num_tokens": 
417448738.0, + "step": 16143 + }, + { + "epoch": 1.7728969909949484, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4526546001434326, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7207809686660767, + "num_tokens": 417472669.0, + "step": 16144 + }, + { + "epoch": 1.7730068086975619, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2580795288085938, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7193177938461304, + "num_tokens": 417500389.0, + "step": 16145 + }, + { + "epoch": 1.7731166264001756, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.2770864963531494, + "learning_rate": 1e-06, + "loss": 1.0653, + "mean_token_accuracy": 0.6945847272872925, + "num_tokens": 417529568.0, + "step": 16146 + }, + { + "epoch": 1.7732264441027894, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.353348970413208, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7111095786094666, + "num_tokens": 417554658.0, + "step": 16147 + }, + { + "epoch": 1.7733362618054032, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.120954751968384, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.704074501991272, + "num_tokens": 417587294.0, + "step": 16148 + }, + { + "epoch": 1.7734460795080167, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3768138885498047, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7120797634124756, + "num_tokens": 417612958.0, + "step": 16149 + }, + { + "epoch": 1.7735558972106302, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.5885982513427734, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7193707227706909, + "num_tokens": 417634148.0, + "step": 16150 + }, + { + "epoch": 1.773665714913244, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.312960386276245, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7041601538658142, + "num_tokens": 417659456.0, + "step": 16151 + }, + { + "epoch": 
1.7737755326158577, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.254352331161499, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6947852969169617, + "num_tokens": 417687557.0, + "step": 16152 + }, + { + "epoch": 1.7738853503184715, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.3605527877807617, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7059463262557983, + "num_tokens": 417712667.0, + "step": 16153 + }, + { + "epoch": 1.773995168021085, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.457282543182373, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7304845452308655, + "num_tokens": 417735849.0, + "step": 16154 + }, + { + "epoch": 1.7741049857236986, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.278637170791626, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6935352087020874, + "num_tokens": 417765359.0, + "step": 16155 + }, + { + "epoch": 1.7742148034263123, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.1186654567718506, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6940478086471558, + "num_tokens": 417797089.0, + "step": 16156 + }, + { + "epoch": 1.774324621128926, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4296398162841797, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7151524424552917, + "num_tokens": 417820062.0, + "step": 16157 + }, + { + "epoch": 1.7744344388315396, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3644063472747803, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6894797086715698, + "num_tokens": 417848752.0, + "step": 16158 + }, + { + "epoch": 1.7745442565341532, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1191537380218506, + "learning_rate": 1e-06, + "loss": 1.1157, + "mean_token_accuracy": 0.6764560341835022, + "num_tokens": 417881739.0, + "step": 16159 + }, + { + "epoch": 1.774654074236767, + "ewc_loss": 
1.919269561767578e-05, + "grad_norm": 2.2291648387908936, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.708402693271637, + "num_tokens": 417913173.0, + "step": 16160 + }, + { + "epoch": 1.7747638919393807, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 2.309835195541382, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7113150954246521, + "num_tokens": 417939617.0, + "step": 16161 + }, + { + "epoch": 1.7748737096419944, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3705155849456787, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7157912850379944, + "num_tokens": 417963686.0, + "step": 16162 + }, + { + "epoch": 1.774983527344608, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2725210189819336, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6995936036109924, + "num_tokens": 417992361.0, + "step": 16163 + }, + { + "epoch": 1.7750933450472215, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3439853191375732, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7344648838043213, + "num_tokens": 418017981.0, + "step": 16164 + }, + { + "epoch": 1.7752031627498353, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2998270988464355, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7289406657218933, + "num_tokens": 418043341.0, + "step": 16165 + }, + { + "epoch": 1.775312980452449, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3091561794281006, + "learning_rate": 1e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6864268183708191, + "num_tokens": 418069128.0, + "step": 16166 + }, + { + "epoch": 1.7754227981550625, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6135969161987305, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7167284488677979, + "num_tokens": 418090271.0, + "step": 16167 + }, + { + "epoch": 1.7755326158576763, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 
2.481793165206909, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7257628440856934, + "num_tokens": 418113119.0, + "step": 16168 + }, + { + "epoch": 1.7756424335602898, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3471717834472656, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7151254415512085, + "num_tokens": 418138183.0, + "step": 16169 + }, + { + "epoch": 1.7757522512629036, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.173861026763916, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6883896589279175, + "num_tokens": 418171349.0, + "step": 16170 + }, + { + "epoch": 1.7758620689655173, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1062545776367188, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7110220789909363, + "num_tokens": 418201351.0, + "step": 16171 + }, + { + "epoch": 1.7759718866681309, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3803977966308594, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7259513139724731, + "num_tokens": 418226757.0, + "step": 16172 + }, + { + "epoch": 1.7760817043707444, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.495476245880127, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.715937614440918, + "num_tokens": 418250369.0, + "step": 16173 + }, + { + "epoch": 1.7761915220733582, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.673007011413574, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7438767552375793, + "num_tokens": 418271077.0, + "step": 16174 + }, + { + "epoch": 1.776301339775972, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6010570526123047, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7270568609237671, + "num_tokens": 418292164.0, + "step": 16175 + }, + { + "epoch": 1.7764111574785857, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5529379844665527, + "learning_rate": 
1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7189032435417175, + "num_tokens": 418315319.0, + "step": 16176 + }, + { + "epoch": 1.7765209751811992, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.481814384460449, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6935629844665527, + "num_tokens": 418339650.0, + "step": 16177 + }, + { + "epoch": 1.7766307928838128, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.131664514541626, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.710374116897583, + "num_tokens": 418370376.0, + "step": 16178 + }, + { + "epoch": 1.7767406105864265, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.388392686843872, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6917082667350769, + "num_tokens": 418395633.0, + "step": 16179 + }, + { + "epoch": 1.7768504282890403, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.520409345626831, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7144662737846375, + "num_tokens": 418417308.0, + "step": 16180 + }, + { + "epoch": 1.7769602459916538, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 7.044367790222168, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7153998017311096, + "num_tokens": 418441138.0, + "step": 16181 + }, + { + "epoch": 1.7770700636942676, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2538933753967285, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6921864748001099, + "num_tokens": 418470919.0, + "step": 16182 + }, + { + "epoch": 1.777179881396881, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.943898916244507, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7055890560150146, + "num_tokens": 418493758.0, + "step": 16183 + }, + { + "epoch": 1.7772896990994949, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.377267837524414, + "learning_rate": 1e-06, + "loss": 1.0379, + 
"mean_token_accuracy": 0.6933652758598328, + "num_tokens": 418520258.0, + "step": 16184 + }, + { + "epoch": 1.7773995168021086, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4429378509521484, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7280068397521973, + "num_tokens": 418544191.0, + "step": 16185 + }, + { + "epoch": 1.7775093345047221, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.608187198638916, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7517956495285034, + "num_tokens": 418564807.0, + "step": 16186 + }, + { + "epoch": 1.7776191522073357, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.308732032775879, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7306495904922485, + "num_tokens": 418590708.0, + "step": 16187 + }, + { + "epoch": 1.7777289699099494, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4162585735321045, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7014783620834351, + "num_tokens": 418617068.0, + "step": 16188 + }, + { + "epoch": 1.7778387876125632, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.413031816482544, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7138686776161194, + "num_tokens": 418640811.0, + "step": 16189 + }, + { + "epoch": 1.777948605315177, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3486204147338867, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7112051248550415, + "num_tokens": 418667841.0, + "step": 16190 + }, + { + "epoch": 1.7780584230177905, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1690316200256348, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7114777565002441, + "num_tokens": 418696645.0, + "step": 16191 + }, + { + "epoch": 1.778168240720404, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1874821186065674, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 
0.7194645404815674, + "num_tokens": 418726463.0, + "step": 16192 + }, + { + "epoch": 1.7782780584230178, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.18605375289917, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7033655643463135, + "num_tokens": 418755149.0, + "step": 16193 + }, + { + "epoch": 1.7783878761256315, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.496616840362549, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7188801765441895, + "num_tokens": 418778806.0, + "step": 16194 + }, + { + "epoch": 1.778497693828245, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5513267517089844, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7284673452377319, + "num_tokens": 418799077.0, + "step": 16195 + }, + { + "epoch": 1.7786075115308586, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4432213306427, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7237240076065063, + "num_tokens": 418823886.0, + "step": 16196 + }, + { + "epoch": 1.7787173292334724, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.323930263519287, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.7033382654190063, + "num_tokens": 418850668.0, + "step": 16197 + }, + { + "epoch": 1.7788271469360861, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.304253578186035, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7142707109451294, + "num_tokens": 418876540.0, + "step": 16198 + }, + { + "epoch": 1.7789369646386999, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.765974283218384, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7486436367034912, + "num_tokens": 418894376.0, + "step": 16199 + }, + { + "epoch": 1.7790467823413134, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.43145751953125, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6968299150466919, + "num_tokens": 
418919237.0, + "step": 16200 + }, + { + "epoch": 1.779156600043927, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1083106994628906, + "learning_rate": 1e-06, + "loss": 1.0808, + "mean_token_accuracy": 0.679219126701355, + "num_tokens": 418954808.0, + "step": 16201 + }, + { + "epoch": 1.7792664177465407, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5639421939849854, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7046384215354919, + "num_tokens": 418979222.0, + "step": 16202 + }, + { + "epoch": 1.7793762354491545, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.347397804260254, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7249239087104797, + "num_tokens": 419005298.0, + "step": 16203 + }, + { + "epoch": 1.7794860531517682, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4485223293304443, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.704768717288971, + "num_tokens": 419031203.0, + "step": 16204 + }, + { + "epoch": 1.7795958708543818, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3643124103546143, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7287425994873047, + "num_tokens": 419056050.0, + "step": 16205 + }, + { + "epoch": 1.7797056885569953, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.43434476852417, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.7045137882232666, + "num_tokens": 419085664.0, + "step": 16206 + }, + { + "epoch": 1.779815506259609, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6778502464294434, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7102099061012268, + "num_tokens": 419107035.0, + "step": 16207 + }, + { + "epoch": 1.7799253239622228, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.375930070877075, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7092971205711365, + "num_tokens": 419134962.0, + "step": 16208 + }, + { + 
"epoch": 1.7800351416648363, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6889586448669434, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7119146585464478, + "num_tokens": 419156469.0, + "step": 16209 + }, + { + "epoch": 1.7801449593674499, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5700695514678955, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.702399730682373, + "num_tokens": 419180412.0, + "step": 16210 + }, + { + "epoch": 1.7802547770700636, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.248490571975708, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7074227333068848, + "num_tokens": 419209102.0, + "step": 16211 + }, + { + "epoch": 1.7803645947726774, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7024049758911133, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7226264476776123, + "num_tokens": 419230360.0, + "step": 16212 + }, + { + "epoch": 1.7804744124752911, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1841580867767334, + "learning_rate": 1e-06, + "loss": 1.1362, + "mean_token_accuracy": 0.6675618290901184, + "num_tokens": 419263585.0, + "step": 16213 + }, + { + "epoch": 1.7805842301779047, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4668054580688477, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7032734155654907, + "num_tokens": 419290289.0, + "step": 16214 + }, + { + "epoch": 1.7806940478805182, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.325988292694092, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6975282430648804, + "num_tokens": 419315191.0, + "step": 16215 + }, + { + "epoch": 1.780803865583132, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1456069946289062, + "learning_rate": 1e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.6772100329399109, + "num_tokens": 419345979.0, + "step": 16216 + }, + { + "epoch": 1.7809136832857457, + 
"ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.416743040084839, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7341770529747009, + "num_tokens": 419369204.0, + "step": 16217 + }, + { + "epoch": 1.7810235009883595, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4087090492248535, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7290864586830139, + "num_tokens": 419393122.0, + "step": 16218 + }, + { + "epoch": 1.781133318690973, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3215115070343018, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7033189535140991, + "num_tokens": 419420255.0, + "step": 16219 + }, + { + "epoch": 1.7812431363935866, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3625166416168213, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7084819078445435, + "num_tokens": 419445421.0, + "step": 16220 + }, + { + "epoch": 1.7813529540962003, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3547682762145996, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.6911335587501526, + "num_tokens": 419473130.0, + "step": 16221 + }, + { + "epoch": 1.781462771798814, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5463595390319824, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7038358449935913, + "num_tokens": 419494795.0, + "step": 16222 + }, + { + "epoch": 1.7815725895014276, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.584310531616211, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6989967823028564, + "num_tokens": 419519442.0, + "step": 16223 + }, + { + "epoch": 1.7816824072040411, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5271780490875244, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7044880390167236, + "num_tokens": 419543835.0, + "step": 16224 + }, + { + "epoch": 1.781792224906655, + "ewc_loss": 1.919269561767578e-05, + 
"grad_norm": 2.6944894790649414, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.710250973701477, + "num_tokens": 419565539.0, + "step": 16225 + }, + { + "epoch": 1.7819020426092687, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3844165802001953, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7144025564193726, + "num_tokens": 419589785.0, + "step": 16226 + }, + { + "epoch": 1.7820118603118824, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.269784927368164, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.74493008852005, + "num_tokens": 419614859.0, + "step": 16227 + }, + { + "epoch": 1.782121678014496, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.17169451713562, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7255114912986755, + "num_tokens": 419645316.0, + "step": 16228 + }, + { + "epoch": 1.7822314957171095, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.467283010482788, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7262847423553467, + "num_tokens": 419669390.0, + "step": 16229 + }, + { + "epoch": 1.7823413134197232, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5895557403564453, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6957197189331055, + "num_tokens": 419692156.0, + "step": 16230 + }, + { + "epoch": 1.782451131122337, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.408151626586914, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7157320976257324, + "num_tokens": 419718398.0, + "step": 16231 + }, + { + "epoch": 1.7825609488249505, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.331000804901123, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7191756963729858, + "num_tokens": 419743950.0, + "step": 16232 + }, + { + "epoch": 1.7826707665275643, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7124953269958496, + 
"learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7323160767555237, + "num_tokens": 419764640.0, + "step": 16233 + }, + { + "epoch": 1.7827805842301778, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3807432651519775, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7133677005767822, + "num_tokens": 419792119.0, + "step": 16234 + }, + { + "epoch": 1.7828904019327916, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.22702956199646, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.716607391834259, + "num_tokens": 419820532.0, + "step": 16235 + }, + { + "epoch": 1.7830002196354053, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4173128604888916, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7357534766197205, + "num_tokens": 419845548.0, + "step": 16236 + }, + { + "epoch": 1.7831100373380189, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5564582347869873, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7146875858306885, + "num_tokens": 419867079.0, + "step": 16237 + }, + { + "epoch": 1.7832198550406324, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5343613624572754, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.715983510017395, + "num_tokens": 419891368.0, + "step": 16238 + }, + { + "epoch": 1.7833296727432462, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4714112281799316, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.7059880495071411, + "num_tokens": 419915401.0, + "step": 16239 + }, + { + "epoch": 1.78343949044586, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3153328895568848, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7101433277130127, + "num_tokens": 419941102.0, + "step": 16240 + }, + { + "epoch": 1.7835493081484737, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.847379684448242, + "learning_rate": 1e-06, + "loss": 
0.9287, + "mean_token_accuracy": 0.718749463558197, + "num_tokens": 419960710.0, + "step": 16241 + }, + { + "epoch": 1.7836591258510872, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6079652309417725, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7228602766990662, + "num_tokens": 419979796.0, + "step": 16242 + }, + { + "epoch": 1.7837689435537007, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6998095512390137, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7163605690002441, + "num_tokens": 419999905.0, + "step": 16243 + }, + { + "epoch": 1.7838787612563145, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2219960689544678, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7048390507698059, + "num_tokens": 420029631.0, + "step": 16244 + }, + { + "epoch": 1.7839885789589283, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5770530700683594, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7187472581863403, + "num_tokens": 420051474.0, + "step": 16245 + }, + { + "epoch": 1.7840983966615418, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6939783096313477, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7151724100112915, + "num_tokens": 420071650.0, + "step": 16246 + }, + { + "epoch": 1.7842082143641556, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.591447591781616, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7091879844665527, + "num_tokens": 420094751.0, + "step": 16247 + }, + { + "epoch": 1.784318032066769, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.370460271835327, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7165063619613647, + "num_tokens": 420121656.0, + "step": 16248 + }, + { + "epoch": 1.7844278497693828, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4869272708892822, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 
0.7237274646759033, + "num_tokens": 420146275.0, + "step": 16249 + }, + { + "epoch": 1.7845376674719966, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.40309476852417, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6938443779945374, + "num_tokens": 420173068.0, + "step": 16250 + }, + { + "epoch": 1.7846474851746101, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1855738162994385, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7080425024032593, + "num_tokens": 420202313.0, + "step": 16251 + }, + { + "epoch": 1.7847573028772237, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5645830631256104, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7382587194442749, + "num_tokens": 420225258.0, + "step": 16252 + }, + { + "epoch": 1.7848671205798374, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.220395088195801, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7106928825378418, + "num_tokens": 420254837.0, + "step": 16253 + }, + { + "epoch": 1.7849769382824512, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 3.7159430980682373, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6916821002960205, + "num_tokens": 420284882.0, + "step": 16254 + }, + { + "epoch": 1.785086755985065, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.369314432144165, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.706991970539093, + "num_tokens": 420312526.0, + "step": 16255 + }, + { + "epoch": 1.7851965736876785, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3060989379882812, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7030130624771118, + "num_tokens": 420341823.0, + "step": 16256 + }, + { + "epoch": 1.785306391390292, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.15531849861145, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7121467590332031, + "num_tokens": 
420372663.0, + "step": 16257 + }, + { + "epoch": 1.7854162090929058, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.371570348739624, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7015707492828369, + "num_tokens": 420396983.0, + "step": 16258 + }, + { + "epoch": 1.7855260267955195, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.374962568283081, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7114394307136536, + "num_tokens": 420421503.0, + "step": 16259 + }, + { + "epoch": 1.785635844498133, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3338615894317627, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6915936470031738, + "num_tokens": 420451332.0, + "step": 16260 + }, + { + "epoch": 1.7857456622007466, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2024149894714355, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7065103054046631, + "num_tokens": 420481896.0, + "step": 16261 + }, + { + "epoch": 1.7858554799033604, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3491430282592773, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7209559679031372, + "num_tokens": 420507325.0, + "step": 16262 + }, + { + "epoch": 1.7859652976059741, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2967782020568848, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7445686459541321, + "num_tokens": 420532423.0, + "step": 16263 + }, + { + "epoch": 1.7860751153085879, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.332213878631592, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7110218405723572, + "num_tokens": 420558188.0, + "step": 16264 + }, + { + "epoch": 1.7861849330112014, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2974371910095215, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7195558547973633, + "num_tokens": 420583440.0, + "step": 16265 + }, + 
{ + "epoch": 1.786294750713815, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.319739818572998, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.720890998840332, + "num_tokens": 420608649.0, + "step": 16266 + }, + { + "epoch": 1.7864045684164287, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.8703012466430664, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7164536714553833, + "num_tokens": 420627120.0, + "step": 16267 + }, + { + "epoch": 1.7865143861190425, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2377429008483887, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.7017677426338196, + "num_tokens": 420656474.0, + "step": 16268 + }, + { + "epoch": 1.7866242038216562, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3081133365631104, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7243499755859375, + "num_tokens": 420683699.0, + "step": 16269 + }, + { + "epoch": 1.7867340215242697, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.339808702468872, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7044212818145752, + "num_tokens": 420708766.0, + "step": 16270 + }, + { + "epoch": 1.7868438392268833, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1688969135284424, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6878339052200317, + "num_tokens": 420741260.0, + "step": 16271 + }, + { + "epoch": 1.786953656929497, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3050994873046875, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.7092975974082947, + "num_tokens": 420768544.0, + "step": 16272 + }, + { + "epoch": 1.7870634746321108, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4908132553100586, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7334397435188293, + "num_tokens": 420790272.0, + "step": 16273 + }, + { + "epoch": 1.7871732923347243, + 
"ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.260636568069458, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6999208927154541, + "num_tokens": 420818546.0, + "step": 16274 + }, + { + "epoch": 1.7872831100373379, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.18377947807312, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6919509172439575, + "num_tokens": 420848155.0, + "step": 16275 + }, + { + "epoch": 1.7873929277399516, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2950758934020996, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7186238765716553, + "num_tokens": 420873991.0, + "step": 16276 + }, + { + "epoch": 1.7875027454425654, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.370032787322998, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7112271189689636, + "num_tokens": 420899750.0, + "step": 16277 + }, + { + "epoch": 1.7876125631451791, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.351874351501465, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7068490386009216, + "num_tokens": 420926433.0, + "step": 16278 + }, + { + "epoch": 1.7877223808477927, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2960214614868164, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.721442699432373, + "num_tokens": 420951268.0, + "step": 16279 + }, + { + "epoch": 1.7878321985504062, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1619722843170166, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7087850570678711, + "num_tokens": 420981931.0, + "step": 16280 + }, + { + "epoch": 1.78794201625302, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.543398380279541, + "learning_rate": 1e-06, + "loss": 1.0872, + "mean_token_accuracy": 0.6805294752120972, + "num_tokens": 421007442.0, + "step": 16281 + }, + { + "epoch": 1.7880518339556337, + "ewc_loss": 1.919269561767578e-05, + 
"grad_norm": 2.3020620346069336, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7022644281387329, + "num_tokens": 421034920.0, + "step": 16282 + }, + { + "epoch": 1.7881616516582475, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.122162342071533, + "learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.6942421793937683, + "num_tokens": 421066890.0, + "step": 16283 + }, + { + "epoch": 1.788271469360861, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5127222537994385, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.690972089767456, + "num_tokens": 421090461.0, + "step": 16284 + }, + { + "epoch": 1.7883812870634745, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6240196228027344, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7123869061470032, + "num_tokens": 421112929.0, + "step": 16285 + }, + { + "epoch": 1.7884911047660883, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4389407634735107, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.714002788066864, + "num_tokens": 421137933.0, + "step": 16286 + }, + { + "epoch": 1.788600922468702, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3234269618988037, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.714341402053833, + "num_tokens": 421165253.0, + "step": 16287 + }, + { + "epoch": 1.7887107401713156, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3692502975463867, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.737995982170105, + "num_tokens": 421187552.0, + "step": 16288 + }, + { + "epoch": 1.7888205578739291, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3131675720214844, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7199267148971558, + "num_tokens": 421213194.0, + "step": 16289 + }, + { + "epoch": 1.7889303755765429, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4298365116119385, + 
"learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7096318602561951, + "num_tokens": 421236247.0, + "step": 16290 + }, + { + "epoch": 1.7890401932791566, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.738523244857788, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7169358730316162, + "num_tokens": 421257100.0, + "step": 16291 + }, + { + "epoch": 1.7891500109817704, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3377299308776855, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7215694785118103, + "num_tokens": 421280175.0, + "step": 16292 + }, + { + "epoch": 1.789259828684384, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5845859050750732, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7417181730270386, + "num_tokens": 421300730.0, + "step": 16293 + }, + { + "epoch": 1.7893696463869975, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.487793207168579, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7333850264549255, + "num_tokens": 421323715.0, + "step": 16294 + }, + { + "epoch": 1.7894794640896112, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6212480068206787, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7008628249168396, + "num_tokens": 421346426.0, + "step": 16295 + }, + { + "epoch": 1.789589281792225, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5134646892547607, + "learning_rate": 1e-06, + "loss": 1.0954, + "mean_token_accuracy": 0.681666374206543, + "num_tokens": 421371371.0, + "step": 16296 + }, + { + "epoch": 1.7896990994948385, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.442727565765381, + "learning_rate": 1e-06, + "loss": 1.0604, + "mean_token_accuracy": 0.6957454681396484, + "num_tokens": 421396099.0, + "step": 16297 + }, + { + "epoch": 1.7898089171974523, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3211166858673096, + "learning_rate": 1e-06, + "loss": 
0.977, + "mean_token_accuracy": 0.7130870819091797, + "num_tokens": 421424550.0, + "step": 16298 + }, + { + "epoch": 1.7899187349000658, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.527968645095825, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7303537130355835, + "num_tokens": 421446253.0, + "step": 16299 + }, + { + "epoch": 1.7900285526026796, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.095637559890747, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7043233513832092, + "num_tokens": 421480936.0, + "step": 16300 + }, + { + "epoch": 1.7901383703052933, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.568267822265625, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7096467018127441, + "num_tokens": 421501546.0, + "step": 16301 + }, + { + "epoch": 1.7902481880079069, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6036953926086426, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7080102562904358, + "num_tokens": 421523198.0, + "step": 16302 + }, + { + "epoch": 1.7903580057105204, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3229269981384277, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7124994397163391, + "num_tokens": 421549623.0, + "step": 16303 + }, + { + "epoch": 1.7904678234131342, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4140963554382324, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6881281137466431, + "num_tokens": 421574848.0, + "step": 16304 + }, + { + "epoch": 1.790577641115748, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.565042734146118, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7124724984169006, + "num_tokens": 421596570.0, + "step": 16305 + }, + { + "epoch": 1.7906874588183617, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1822545528411865, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 
0.7024565935134888, + "num_tokens": 421626890.0, + "step": 16306 + }, + { + "epoch": 1.7907972765209752, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.421921730041504, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7157217860221863, + "num_tokens": 421652410.0, + "step": 16307 + }, + { + "epoch": 1.7909070942235887, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.449608564376831, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7030630707740784, + "num_tokens": 421677817.0, + "step": 16308 + }, + { + "epoch": 1.7910169119262025, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.388038396835327, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7236019968986511, + "num_tokens": 421701895.0, + "step": 16309 + }, + { + "epoch": 1.7911267296288162, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.352856159210205, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7134271860122681, + "num_tokens": 421727887.0, + "step": 16310 + }, + { + "epoch": 1.7912365473314298, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3108386993408203, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.70964515209198, + "num_tokens": 421754232.0, + "step": 16311 + }, + { + "epoch": 1.7913463650340435, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.722946882247925, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7035582065582275, + "num_tokens": 421774594.0, + "step": 16312 + }, + { + "epoch": 1.791456182736657, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.527716636657715, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6950825452804565, + "num_tokens": 421799392.0, + "step": 16313 + }, + { + "epoch": 1.7915660004392708, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.427140951156616, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6963456869125366, + "num_tokens": 
421827295.0, + "step": 16314 + }, + { + "epoch": 1.7916758181418846, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.387691020965576, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7076228260993958, + "num_tokens": 421853525.0, + "step": 16315 + }, + { + "epoch": 1.7917856358444981, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3176004886627197, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7261731028556824, + "num_tokens": 421880922.0, + "step": 16316 + }, + { + "epoch": 1.7918954535471117, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.533193588256836, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7210903763771057, + "num_tokens": 421905972.0, + "step": 16317 + }, + { + "epoch": 1.7920052712497254, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.502254009246826, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7095657587051392, + "num_tokens": 421928724.0, + "step": 16318 + }, + { + "epoch": 1.7921150889523392, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.249412775039673, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7065262198448181, + "num_tokens": 421956659.0, + "step": 16319 + }, + { + "epoch": 1.792224906654953, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4636526107788086, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7027417421340942, + "num_tokens": 421979564.0, + "step": 16320 + }, + { + "epoch": 1.7923347243575665, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5173704624176025, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7458456754684448, + "num_tokens": 422000886.0, + "step": 16321 + }, + { + "epoch": 1.79244454206018, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.354482889175415, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6980319619178772, + "num_tokens": 422027894.0, + "step": 16322 + }, + { + 
"epoch": 1.7925543597627938, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.536372661590576, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7090575695037842, + "num_tokens": 422050366.0, + "step": 16323 + }, + { + "epoch": 1.7926641774654075, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.167081356048584, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7153595685958862, + "num_tokens": 422078367.0, + "step": 16324 + }, + { + "epoch": 1.792773995168021, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.381981611251831, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7212082147598267, + "num_tokens": 422102979.0, + "step": 16325 + }, + { + "epoch": 1.7928838128706346, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5303122997283936, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7149655818939209, + "num_tokens": 422127190.0, + "step": 16326 + }, + { + "epoch": 1.7929936305732483, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5075900554656982, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6975992321968079, + "num_tokens": 422151054.0, + "step": 16327 + }, + { + "epoch": 1.793103448275862, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2823870182037354, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.729875385761261, + "num_tokens": 422176163.0, + "step": 16328 + }, + { + "epoch": 1.7932132659784759, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3351640701293945, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7069737315177917, + "num_tokens": 422200960.0, + "step": 16329 + }, + { + "epoch": 1.7933230836810894, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1172327995300293, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7205322980880737, + "num_tokens": 422229630.0, + "step": 16330 + }, + { + "epoch": 1.793432901383703, + 
"ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.233525276184082, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7015159130096436, + "num_tokens": 422259552.0, + "step": 16331 + }, + { + "epoch": 1.7935427190863167, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2737538814544678, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7055386304855347, + "num_tokens": 422289751.0, + "step": 16332 + }, + { + "epoch": 1.7936525367889304, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4080677032470703, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7197558879852295, + "num_tokens": 422314633.0, + "step": 16333 + }, + { + "epoch": 1.7937623544915442, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2001118659973145, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7184387445449829, + "num_tokens": 422341077.0, + "step": 16334 + }, + { + "epoch": 1.7938721721941577, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3798892498016357, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.6877064108848572, + "num_tokens": 422369625.0, + "step": 16335 + }, + { + "epoch": 1.7939819898967713, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4639992713928223, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.714943528175354, + "num_tokens": 422394613.0, + "step": 16336 + }, + { + "epoch": 1.794091807599385, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.533607244491577, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7132784128189087, + "num_tokens": 422417982.0, + "step": 16337 + }, + { + "epoch": 1.7942016253019988, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.360440492630005, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7258078455924988, + "num_tokens": 422443533.0, + "step": 16338 + }, + { + "epoch": 1.7943114430046123, + "ewc_loss": 1.919269561767578e-05, + 
"grad_norm": 2.4164562225341797, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6987721920013428, + "num_tokens": 422468024.0, + "step": 16339 + }, + { + "epoch": 1.7944212607072259, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5714805126190186, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.703499436378479, + "num_tokens": 422490614.0, + "step": 16340 + }, + { + "epoch": 1.7945310784098396, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6723711490631104, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7198866009712219, + "num_tokens": 422512313.0, + "step": 16341 + }, + { + "epoch": 1.7946408961124534, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.331364870071411, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7196022272109985, + "num_tokens": 422537722.0, + "step": 16342 + }, + { + "epoch": 1.7947507138150671, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.0630061626434326, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7141015529632568, + "num_tokens": 422567382.0, + "step": 16343 + }, + { + "epoch": 1.7948605315176807, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.261972427368164, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6940783262252808, + "num_tokens": 422595383.0, + "step": 16344 + }, + { + "epoch": 1.7949703492202942, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.299889326095581, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7088931798934937, + "num_tokens": 422623755.0, + "step": 16345 + }, + { + "epoch": 1.795080166922908, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.383958578109741, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7041173577308655, + "num_tokens": 422649779.0, + "step": 16346 + }, + { + "epoch": 1.7951899846255217, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6016299724578857, + 
"learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7185972332954407, + "num_tokens": 422671435.0, + "step": 16347 + }, + { + "epoch": 1.7952998023281352, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2733333110809326, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.73260098695755, + "num_tokens": 422696878.0, + "step": 16348 + }, + { + "epoch": 1.795409620030749, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4027903079986572, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7364038825035095, + "num_tokens": 422720433.0, + "step": 16349 + }, + { + "epoch": 1.7955194377333625, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5330371856689453, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6932494044303894, + "num_tokens": 422743653.0, + "step": 16350 + }, + { + "epoch": 1.7956292554359763, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.345737934112549, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7036118507385254, + "num_tokens": 422771067.0, + "step": 16351 + }, + { + "epoch": 1.79573907313859, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3313636779785156, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6908770799636841, + "num_tokens": 422798121.0, + "step": 16352 + }, + { + "epoch": 1.7958488908412036, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5457801818847656, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7047955989837646, + "num_tokens": 422821499.0, + "step": 16353 + }, + { + "epoch": 1.7959587085438171, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4269216060638428, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7263635993003845, + "num_tokens": 422843980.0, + "step": 16354 + }, + { + "epoch": 1.7960685262464309, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5221474170684814, + "learning_rate": 1e-06, + "loss": 
0.9739, + "mean_token_accuracy": 0.716322660446167, + "num_tokens": 422866031.0, + "step": 16355 + }, + { + "epoch": 1.7961783439490446, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1666994094848633, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7122676372528076, + "num_tokens": 422900563.0, + "step": 16356 + }, + { + "epoch": 1.7962881616516584, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.335444688796997, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7056235074996948, + "num_tokens": 422926656.0, + "step": 16357 + }, + { + "epoch": 1.796397979354272, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.329303026199341, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7137617468833923, + "num_tokens": 422952485.0, + "step": 16358 + }, + { + "epoch": 1.7965077970568855, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1753573417663574, + "learning_rate": 1e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.6895527243614197, + "num_tokens": 422984735.0, + "step": 16359 + }, + { + "epoch": 1.7966176147594992, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5479719638824463, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7216356992721558, + "num_tokens": 423008382.0, + "step": 16360 + }, + { + "epoch": 1.796727432462113, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7041144371032715, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7265759706497192, + "num_tokens": 423028419.0, + "step": 16361 + }, + { + "epoch": 1.7968372501647265, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.416576862335205, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7127307653427124, + "num_tokens": 423053152.0, + "step": 16362 + }, + { + "epoch": 1.7969470678673403, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2871170043945312, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 
0.7260726094245911, + "num_tokens": 423079331.0, + "step": 16363 + }, + { + "epoch": 1.7970568855699538, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5192058086395264, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7050902843475342, + "num_tokens": 423102215.0, + "step": 16364 + }, + { + "epoch": 1.7971667032725676, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1610779762268066, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7399479150772095, + "num_tokens": 423131041.0, + "step": 16365 + }, + { + "epoch": 1.7972765209751813, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2985177040100098, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7203989028930664, + "num_tokens": 423158313.0, + "step": 16366 + }, + { + "epoch": 1.7973863386777948, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.751661777496338, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7139253616333008, + "num_tokens": 423177086.0, + "step": 16367 + }, + { + "epoch": 1.7974961563804084, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3461880683898926, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7121443748474121, + "num_tokens": 423202341.0, + "step": 16368 + }, + { + "epoch": 1.7976059740830221, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1974942684173584, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6962518692016602, + "num_tokens": 423232975.0, + "step": 16369 + }, + { + "epoch": 1.797715791785636, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.58730149269104, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.702570915222168, + "num_tokens": 423255271.0, + "step": 16370 + }, + { + "epoch": 1.7978256094882497, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.434577703475952, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.7013571262359619, + "num_tokens": 
423279840.0, + "step": 16371 + }, + { + "epoch": 1.7979354271908632, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4889333248138428, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7204326391220093, + "num_tokens": 423301839.0, + "step": 16372 + }, + { + "epoch": 1.7980452448934767, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4533541202545166, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7001304626464844, + "num_tokens": 423325332.0, + "step": 16373 + }, + { + "epoch": 1.7981550625960905, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2120275497436523, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7210423946380615, + "num_tokens": 423354366.0, + "step": 16374 + }, + { + "epoch": 1.7982648802987042, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5097124576568604, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6957782506942749, + "num_tokens": 423378460.0, + "step": 16375 + }, + { + "epoch": 1.7983746980013178, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.267023801803589, + "learning_rate": 1e-06, + "loss": 1.0897, + "mean_token_accuracy": 0.695929765701294, + "num_tokens": 423409450.0, + "step": 16376 + }, + { + "epoch": 1.7984845157039313, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.554593324661255, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.7080191373825073, + "num_tokens": 423432078.0, + "step": 16377 + }, + { + "epoch": 1.798594333406545, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1691324710845947, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6890031099319458, + "num_tokens": 423463737.0, + "step": 16378 + }, + { + "epoch": 1.7987041511091588, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 4.344796657562256, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7262306809425354, + "num_tokens": 423490202.0, + "step": 16379 + }, + { 
+ "epoch": 1.7988139688117726, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2877755165100098, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7187522649765015, + "num_tokens": 423518171.0, + "step": 16380 + }, + { + "epoch": 1.7989237865143861, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.19124436378479, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6970126628875732, + "num_tokens": 423549214.0, + "step": 16381 + }, + { + "epoch": 1.7990336042169996, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3080127239227295, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6835229396820068, + "num_tokens": 423583822.0, + "step": 16382 + }, + { + "epoch": 1.7991434219196134, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2139029502868652, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7108522653579712, + "num_tokens": 423611450.0, + "step": 16383 + }, + { + "epoch": 1.7992532396222272, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2541956901550293, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7176231145858765, + "num_tokens": 423639246.0, + "step": 16384 + }, + { + "epoch": 1.799363057324841, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4189670085906982, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7096084356307983, + "num_tokens": 423663237.0, + "step": 16385 + }, + { + "epoch": 1.7994728750274545, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 7.243786811828613, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.708581805229187, + "num_tokens": 423682575.0, + "step": 16386 + }, + { + "epoch": 1.799582692730068, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3134875297546387, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.7027513384819031, + "num_tokens": 423709095.0, + "step": 16387 + }, + { + "epoch": 1.7996925104326817, + 
"ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.622676134109497, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7367610931396484, + "num_tokens": 423729133.0, + "step": 16388 + }, + { + "epoch": 1.7998023281352955, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6094963550567627, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7294048070907593, + "num_tokens": 423756251.0, + "step": 16389 + }, + { + "epoch": 1.799912145837909, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.46468186378479, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6955724358558655, + "num_tokens": 423780483.0, + "step": 16390 + }, + { + "epoch": 1.8000219635405226, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.30607008934021, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7117071151733398, + "num_tokens": 423807886.0, + "step": 16391 + }, + { + "epoch": 1.8001317812431363, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.494011640548706, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7278991937637329, + "num_tokens": 423830403.0, + "step": 16392 + }, + { + "epoch": 1.80024159894575, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1222281455993652, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.695224940776825, + "num_tokens": 423862554.0, + "step": 16393 + }, + { + "epoch": 1.8003514166483638, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3994028568267822, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7203013896942139, + "num_tokens": 423889404.0, + "step": 16394 + }, + { + "epoch": 1.8004612343509774, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1069176197052, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7049015760421753, + "num_tokens": 423920984.0, + "step": 16395 + }, + { + "epoch": 1.800571052053591, + "ewc_loss": 1.919269561767578e-05, + 
"grad_norm": 2.5600814819335938, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6924305558204651, + "num_tokens": 423944673.0, + "step": 16396 + }, + { + "epoch": 1.8006808697562047, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2629177570343018, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7024455070495605, + "num_tokens": 423971785.0, + "step": 16397 + }, + { + "epoch": 1.8007906874588184, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7107694149017334, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7266842126846313, + "num_tokens": 423993324.0, + "step": 16398 + }, + { + "epoch": 1.8009005051614322, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2017037868499756, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7105050086975098, + "num_tokens": 424022594.0, + "step": 16399 + }, + { + "epoch": 1.8010103228640457, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3339624404907227, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7031793594360352, + "num_tokens": 424050715.0, + "step": 16400 + }, + { + "epoch": 1.8011201405666593, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.453164577484131, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6986035108566284, + "num_tokens": 424075604.0, + "step": 16401 + }, + { + "epoch": 1.801229958269273, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.8641984462738037, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7439324259757996, + "num_tokens": 424093160.0, + "step": 16402 + }, + { + "epoch": 1.8013397759718868, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2136595249176025, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6897538304328918, + "num_tokens": 424121916.0, + "step": 16403 + }, + { + "epoch": 1.8014495936745003, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.357630968093872, + 
"learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7175070643424988, + "num_tokens": 424146915.0, + "step": 16404 + }, + { + "epoch": 1.8015594113771138, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.388348340988159, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6930014491081238, + "num_tokens": 424174791.0, + "step": 16405 + }, + { + "epoch": 1.8016692290797276, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1457650661468506, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7045587301254272, + "num_tokens": 424206433.0, + "step": 16406 + }, + { + "epoch": 1.8017790467823414, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.455861806869507, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7532713413238525, + "num_tokens": 424229767.0, + "step": 16407 + }, + { + "epoch": 1.801888864484955, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3406906127929688, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7108813524246216, + "num_tokens": 424258047.0, + "step": 16408 + }, + { + "epoch": 1.8019986821875686, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5194783210754395, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6957253813743591, + "num_tokens": 424281964.0, + "step": 16409 + }, + { + "epoch": 1.8021084998901822, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.270778179168701, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.7107601761817932, + "num_tokens": 424309618.0, + "step": 16410 + }, + { + "epoch": 1.802218317592796, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3487555980682373, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.715641438961029, + "num_tokens": 424336400.0, + "step": 16411 + }, + { + "epoch": 1.8023281352954097, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.615731716156006, + "learning_rate": 1e-06, + "loss": 
0.9855, + "mean_token_accuracy": 0.7142828702926636, + "num_tokens": 424358957.0, + "step": 16412 + }, + { + "epoch": 1.8024379529980232, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.0574679374694824, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.7036917209625244, + "num_tokens": 424390698.0, + "step": 16413 + }, + { + "epoch": 1.802547770700637, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.803879499435425, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7248951196670532, + "num_tokens": 424408802.0, + "step": 16414 + }, + { + "epoch": 1.8026575884032505, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5861570835113525, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.7127684354782104, + "num_tokens": 424432093.0, + "step": 16415 + }, + { + "epoch": 1.8027674061058643, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3741490840911865, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7121762037277222, + "num_tokens": 424457333.0, + "step": 16416 + }, + { + "epoch": 1.802877223808478, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.593101978302002, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7168400287628174, + "num_tokens": 424479372.0, + "step": 16417 + }, + { + "epoch": 1.8029870415110916, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.321650981903076, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6918489933013916, + "num_tokens": 424506414.0, + "step": 16418 + }, + { + "epoch": 1.803096859213705, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.542308807373047, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.687777042388916, + "num_tokens": 424530747.0, + "step": 16419 + }, + { + "epoch": 1.8032066769163189, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5997560024261475, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 
0.710679292678833, + "num_tokens": 424554944.0, + "step": 16420 + }, + { + "epoch": 1.8033164946189326, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.378147840499878, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7418791055679321, + "num_tokens": 424577934.0, + "step": 16421 + }, + { + "epoch": 1.8034263123215464, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5644023418426514, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.701860785484314, + "num_tokens": 424600956.0, + "step": 16422 + }, + { + "epoch": 1.80353613002416, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3364481925964355, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7173568606376648, + "num_tokens": 424625842.0, + "step": 16423 + }, + { + "epoch": 1.8036459477267734, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3132574558258057, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7160677909851074, + "num_tokens": 424652210.0, + "step": 16424 + }, + { + "epoch": 1.8037557654293872, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4861059188842773, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7213135957717896, + "num_tokens": 424676026.0, + "step": 16425 + }, + { + "epoch": 1.803865583132001, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.677842378616333, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7015502452850342, + "num_tokens": 424697535.0, + "step": 16426 + }, + { + "epoch": 1.8039754008346145, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3849217891693115, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7213957905769348, + "num_tokens": 424725156.0, + "step": 16427 + }, + { + "epoch": 1.8040852185372283, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7333240509033203, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7052991986274719, + "num_tokens": 
424746333.0, + "step": 16428 + }, + { + "epoch": 1.8041950362398418, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4000542163848877, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7250178456306458, + "num_tokens": 424771557.0, + "step": 16429 + }, + { + "epoch": 1.8043048539424555, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.603595733642578, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7014955282211304, + "num_tokens": 424792970.0, + "step": 16430 + }, + { + "epoch": 1.8044146716450693, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6089205741882324, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7502583265304565, + "num_tokens": 424812348.0, + "step": 16431 + }, + { + "epoch": 1.8045244893476828, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2876269817352295, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6975818276405334, + "num_tokens": 424840825.0, + "step": 16432 + }, + { + "epoch": 1.8046343070502964, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6180670261383057, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7311990261077881, + "num_tokens": 424864673.0, + "step": 16433 + }, + { + "epoch": 1.8047441247529101, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5994884967803955, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7206765413284302, + "num_tokens": 424885996.0, + "step": 16434 + }, + { + "epoch": 1.8048539424555239, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.41780161857605, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7017161250114441, + "num_tokens": 424913553.0, + "step": 16435 + }, + { + "epoch": 1.8049637601581376, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4882681369781494, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.7007757425308228, + "num_tokens": 424936463.0, + "step": 16436 + }, + 
{ + "epoch": 1.8050735778607512, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1873629093170166, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6933085322380066, + "num_tokens": 424967622.0, + "step": 16437 + }, + { + "epoch": 1.8051833955633647, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3140640258789062, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7005316019058228, + "num_tokens": 424996346.0, + "step": 16438 + }, + { + "epoch": 1.8052932132659785, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7719614505767822, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7194778919219971, + "num_tokens": 425016697.0, + "step": 16439 + }, + { + "epoch": 1.8054030309685922, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5095584392547607, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7117191553115845, + "num_tokens": 425040663.0, + "step": 16440 + }, + { + "epoch": 1.8055128486712058, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3182477951049805, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7268967628479004, + "num_tokens": 425066619.0, + "step": 16441 + }, + { + "epoch": 1.8056226663738193, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2889668941497803, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7043079733848572, + "num_tokens": 425095651.0, + "step": 16442 + }, + { + "epoch": 1.805732484076433, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.0669353008270264, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6957671642303467, + "num_tokens": 425129103.0, + "step": 16443 + }, + { + "epoch": 1.8058423017790468, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3045506477355957, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7115145921707153, + "num_tokens": 425158032.0, + "step": 16444 + }, + { + "epoch": 1.8059521194816606, 
+ "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6214046478271484, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7073314785957336, + "num_tokens": 425179784.0, + "step": 16445 + }, + { + "epoch": 1.806061937184274, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5950024127960205, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7258070707321167, + "num_tokens": 425204646.0, + "step": 16446 + }, + { + "epoch": 1.8061717548868876, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7015507221221924, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.6918543577194214, + "num_tokens": 425230154.0, + "step": 16447 + }, + { + "epoch": 1.8062815725895014, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.451902151107788, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.721144437789917, + "num_tokens": 425252949.0, + "step": 16448 + }, + { + "epoch": 1.8063913902921152, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.375828266143799, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7093433141708374, + "num_tokens": 425279039.0, + "step": 16449 + }, + { + "epoch": 1.806501207994729, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.9573224782943726, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7043948173522949, + "num_tokens": 425314120.0, + "step": 16450 + }, + { + "epoch": 1.8066110256973424, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3374340534210205, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7114838361740112, + "num_tokens": 425341235.0, + "step": 16451 + }, + { + "epoch": 1.806720843399956, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5206212997436523, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7345852851867676, + "num_tokens": 425362936.0, + "step": 16452 + }, + { + "epoch": 1.8068306611025697, + "ewc_loss": 1.919269561767578e-05, + 
"grad_norm": 2.392122268676758, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7047038674354553, + "num_tokens": 425387980.0, + "step": 16453 + }, + { + "epoch": 1.8069404788051835, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3490326404571533, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7204655408859253, + "num_tokens": 425415000.0, + "step": 16454 + }, + { + "epoch": 1.807050296507797, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.516098737716675, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7152618765830994, + "num_tokens": 425438014.0, + "step": 16455 + }, + { + "epoch": 1.8071601142104106, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2552154064178467, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.7002637386322021, + "num_tokens": 425465809.0, + "step": 16456 + }, + { + "epoch": 1.8072699319130243, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.260239601135254, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.696549117565155, + "num_tokens": 425494739.0, + "step": 16457 + }, + { + "epoch": 1.807379749615638, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4391674995422363, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7095667719841003, + "num_tokens": 425519222.0, + "step": 16458 + }, + { + "epoch": 1.8074895673182518, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.358675479888916, + "learning_rate": 1e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.6913798451423645, + "num_tokens": 425546860.0, + "step": 16459 + }, + { + "epoch": 1.8075993850208654, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.172154188156128, + "learning_rate": 1e-06, + "loss": 1.102, + "mean_token_accuracy": 0.6814929246902466, + "num_tokens": 425578098.0, + "step": 16460 + }, + { + "epoch": 1.807709202723479, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.410832643508911, + 
"learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.703274667263031, + "num_tokens": 425603109.0, + "step": 16461 + }, + { + "epoch": 1.8078190204260927, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6293933391571045, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6943476796150208, + "num_tokens": 425624711.0, + "step": 16462 + }, + { + "epoch": 1.8079288381287064, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.317153215408325, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7044408321380615, + "num_tokens": 425651333.0, + "step": 16463 + }, + { + "epoch": 1.8080386558313202, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2814948558807373, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7090430855751038, + "num_tokens": 425681369.0, + "step": 16464 + }, + { + "epoch": 1.8081484735339337, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.480886936187744, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7412084937095642, + "num_tokens": 425705748.0, + "step": 16465 + }, + { + "epoch": 1.8082582912365472, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2736852169036865, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7392462491989136, + "num_tokens": 425730636.0, + "step": 16466 + }, + { + "epoch": 1.808368108939161, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3119685649871826, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6993179321289062, + "num_tokens": 425758932.0, + "step": 16467 + }, + { + "epoch": 1.8084779266417748, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7109649181365967, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7180348038673401, + "num_tokens": 425778446.0, + "step": 16468 + }, + { + "epoch": 1.8085877443443883, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.206927537918091, + "learning_rate": 1e-06, + "loss": 
0.949, + "mean_token_accuracy": 0.7211487293243408, + "num_tokens": 425807780.0, + "step": 16469 + }, + { + "epoch": 1.8086975620470018, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3532967567443848, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6892572641372681, + "num_tokens": 425834352.0, + "step": 16470 + }, + { + "epoch": 1.8088073797496156, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.209439992904663, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6992412805557251, + "num_tokens": 425864174.0, + "step": 16471 + }, + { + "epoch": 1.8089171974522293, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.230149507522583, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7181103229522705, + "num_tokens": 425894335.0, + "step": 16472 + }, + { + "epoch": 1.809027015154843, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3524739742279053, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6988033056259155, + "num_tokens": 425925002.0, + "step": 16473 + }, + { + "epoch": 1.8091368328574566, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4295740127563477, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.70036780834198, + "num_tokens": 425950776.0, + "step": 16474 + }, + { + "epoch": 1.8092466505600702, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.284813642501831, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7025064826011658, + "num_tokens": 425977702.0, + "step": 16475 + }, + { + "epoch": 1.809356468262684, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.602914571762085, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7313138842582703, + "num_tokens": 425999777.0, + "step": 16476 + }, + { + "epoch": 1.8094662859652977, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.500011920928955, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 
0.697400689125061, + "num_tokens": 426023601.0, + "step": 16477 + }, + { + "epoch": 1.8095761036679112, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2849040031433105, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7141345143318176, + "num_tokens": 426052195.0, + "step": 16478 + }, + { + "epoch": 1.809685921370525, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4422173500061035, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.709530770778656, + "num_tokens": 426076543.0, + "step": 16479 + }, + { + "epoch": 1.8097957390731385, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.437622547149658, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6993100643157959, + "num_tokens": 426102646.0, + "step": 16480 + }, + { + "epoch": 1.8099055567757523, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3540141582489014, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7184714078903198, + "num_tokens": 426128498.0, + "step": 16481 + }, + { + "epoch": 1.810015374478366, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.477198362350464, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7123618125915527, + "num_tokens": 426152242.0, + "step": 16482 + }, + { + "epoch": 1.8101251921809796, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.270068883895874, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7093847990036011, + "num_tokens": 426179491.0, + "step": 16483 + }, + { + "epoch": 1.810235009883593, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2761621475219727, + "learning_rate": 1e-06, + "loss": 1.1189, + "mean_token_accuracy": 0.6757925748825073, + "num_tokens": 426210144.0, + "step": 16484 + }, + { + "epoch": 1.8103448275862069, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.492849111557007, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6957730650901794, + "num_tokens": 
426240115.0, + "step": 16485 + }, + { + "epoch": 1.8104546452888206, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4348206520080566, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7097434997558594, + "num_tokens": 426264571.0, + "step": 16486 + }, + { + "epoch": 1.8105644629914344, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.0981533527374268, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6883838176727295, + "num_tokens": 426299083.0, + "step": 16487 + }, + { + "epoch": 1.810674280694048, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.318852424621582, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7071152925491333, + "num_tokens": 426324702.0, + "step": 16488 + }, + { + "epoch": 1.8107840983966614, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.573956251144409, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7218515276908875, + "num_tokens": 426346607.0, + "step": 16489 + }, + { + "epoch": 1.8108939160992752, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3702588081359863, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6963951587677002, + "num_tokens": 426370961.0, + "step": 16490 + }, + { + "epoch": 1.811003733801889, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2984132766723633, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7128102779388428, + "num_tokens": 426401716.0, + "step": 16491 + }, + { + "epoch": 1.8111135515045025, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6794471740722656, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7048957347869873, + "num_tokens": 426423232.0, + "step": 16492 + }, + { + "epoch": 1.8112233692071162, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7389001846313477, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7109878063201904, + "num_tokens": 426444908.0, + "step": 16493 + }, + 
{ + "epoch": 1.8113331869097298, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.768692970275879, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.737068235874176, + "num_tokens": 426464873.0, + "step": 16494 + }, + { + "epoch": 1.8114430046123435, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1948890686035156, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7041999101638794, + "num_tokens": 426493448.0, + "step": 16495 + }, + { + "epoch": 1.8115528223149573, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6756274700164795, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7127425670623779, + "num_tokens": 426514810.0, + "step": 16496 + }, + { + "epoch": 1.8116626400175708, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6752331256866455, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.729040265083313, + "num_tokens": 426535324.0, + "step": 16497 + }, + { + "epoch": 1.8117724577201844, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.336696147918701, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7130510807037354, + "num_tokens": 426563419.0, + "step": 16498 + }, + { + "epoch": 1.8118822754227981, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2961621284484863, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7198246121406555, + "num_tokens": 426589380.0, + "step": 16499 + }, + { + "epoch": 1.8119920931254119, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5867996215820312, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7168883681297302, + "num_tokens": 426613698.0, + "step": 16500 + }, + { + "epoch": 1.8121019108280256, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.0055346488952637, + "learning_rate": 1e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6821060180664062, + "num_tokens": 426652593.0, + "step": 16501 + }, + { + "epoch": 1.8122117285306392, + 
"ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.241773843765259, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6949576139450073, + "num_tokens": 426682565.0, + "step": 16502 + }, + { + "epoch": 1.8123215462332527, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.034334421157837, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7045605182647705, + "num_tokens": 426718171.0, + "step": 16503 + }, + { + "epoch": 1.8124313639358665, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.037281036376953, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7211054563522339, + "num_tokens": 426751561.0, + "step": 16504 + }, + { + "epoch": 1.8125411816384802, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3578569889068604, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6931889057159424, + "num_tokens": 426776114.0, + "step": 16505 + }, + { + "epoch": 1.8126509993410937, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1236228942871094, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.701884925365448, + "num_tokens": 426805410.0, + "step": 16506 + }, + { + "epoch": 1.8127608170437073, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3932437896728516, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6913325786590576, + "num_tokens": 426831192.0, + "step": 16507 + }, + { + "epoch": 1.812870634746321, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.359388828277588, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6923500299453735, + "num_tokens": 426858715.0, + "step": 16508 + }, + { + "epoch": 1.8129804524489348, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.314755439758301, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6888712644577026, + "num_tokens": 426888925.0, + "step": 16509 + }, + { + "epoch": 1.8130902701515486, + "ewc_loss": 1.919269561767578e-05, + 
"grad_norm": 2.505134105682373, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7129914164543152, + "num_tokens": 426911423.0, + "step": 16510 + }, + { + "epoch": 1.813200087854162, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.562987804412842, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6932888031005859, + "num_tokens": 426935761.0, + "step": 16511 + }, + { + "epoch": 1.8133099055567756, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.260491132736206, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7182110548019409, + "num_tokens": 426963565.0, + "step": 16512 + }, + { + "epoch": 1.8134197232593894, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3162286281585693, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7273924350738525, + "num_tokens": 426988193.0, + "step": 16513 + }, + { + "epoch": 1.8135295409620031, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.9950411319732666, + "learning_rate": 1e-06, + "loss": 1.1265, + "mean_token_accuracy": 0.6779450178146362, + "num_tokens": 427022948.0, + "step": 16514 + }, + { + "epoch": 1.813639358664617, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2676539421081543, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7109031677246094, + "num_tokens": 427052540.0, + "step": 16515 + }, + { + "epoch": 1.8137491763672304, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.46293568611145, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7090135812759399, + "num_tokens": 427075944.0, + "step": 16516 + }, + { + "epoch": 1.813858994069844, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.0394153594970703, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.715079665184021, + "num_tokens": 427108854.0, + "step": 16517 + }, + { + "epoch": 1.8139688117724577, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5877225399017334, + 
"learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.707743763923645, + "num_tokens": 427131274.0, + "step": 16518 + }, + { + "epoch": 1.8140786294750715, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.467284679412842, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7056078910827637, + "num_tokens": 427155856.0, + "step": 16519 + }, + { + "epoch": 1.814188447177685, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.319002389907837, + "learning_rate": 1e-06, + "loss": 1.1266, + "mean_token_accuracy": 0.674774169921875, + "num_tokens": 427185417.0, + "step": 16520 + }, + { + "epoch": 1.8142982648802986, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.424856185913086, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6873064041137695, + "num_tokens": 427212866.0, + "step": 16521 + }, + { + "epoch": 1.8144080825829123, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.259395122528076, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6974337697029114, + "num_tokens": 427238942.0, + "step": 16522 + }, + { + "epoch": 1.814517900285526, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3282275199890137, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7121416330337524, + "num_tokens": 427264887.0, + "step": 16523 + }, + { + "epoch": 1.8146277179881398, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.211561441421509, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7252030372619629, + "num_tokens": 427292572.0, + "step": 16524 + }, + { + "epoch": 1.8147375356907534, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5510964393615723, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6995256543159485, + "num_tokens": 427316787.0, + "step": 16525 + }, + { + "epoch": 1.814847353393367, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2893033027648926, + "learning_rate": 1e-06, + "loss": 1.059, + 
"mean_token_accuracy": 0.6831722259521484, + "num_tokens": 427348035.0, + "step": 16526 + }, + { + "epoch": 1.8149571710959806, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3276426792144775, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7126997709274292, + "num_tokens": 427373501.0, + "step": 16527 + }, + { + "epoch": 1.8150669887985944, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.8168723583221436, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7237692475318909, + "num_tokens": 427392578.0, + "step": 16528 + }, + { + "epoch": 1.815176806501208, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2908504009246826, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7107079029083252, + "num_tokens": 427420076.0, + "step": 16529 + }, + { + "epoch": 1.8152866242038217, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.486945152282715, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6967741250991821, + "num_tokens": 427447000.0, + "step": 16530 + }, + { + "epoch": 1.8153964419064352, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6723692417144775, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7027685046195984, + "num_tokens": 427467724.0, + "step": 16531 + }, + { + "epoch": 1.815506259609049, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.148806571960449, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7211926579475403, + "num_tokens": 427498695.0, + "step": 16532 + }, + { + "epoch": 1.8156160773116627, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.373194694519043, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7109270095825195, + "num_tokens": 427524312.0, + "step": 16533 + }, + { + "epoch": 1.8157258950142763, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4145522117614746, + "learning_rate": 1e-06, + "loss": 1.0767, + "mean_token_accuracy": 
0.6836552619934082, + "num_tokens": 427552420.0, + "step": 16534 + }, + { + "epoch": 1.8158357127168898, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.387540578842163, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.714735746383667, + "num_tokens": 427577265.0, + "step": 16535 + }, + { + "epoch": 1.8159455304195036, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1148178577423096, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7210721373558044, + "num_tokens": 427607241.0, + "step": 16536 + }, + { + "epoch": 1.8160553481221173, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6682655811309814, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7177261114120483, + "num_tokens": 427626819.0, + "step": 16537 + }, + { + "epoch": 1.816165165824731, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.234942674636841, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7190164923667908, + "num_tokens": 427656171.0, + "step": 16538 + }, + { + "epoch": 1.8162749835273446, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2824578285217285, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6914443373680115, + "num_tokens": 427683797.0, + "step": 16539 + }, + { + "epoch": 1.8163848012299582, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1963374614715576, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7301574349403381, + "num_tokens": 427710480.0, + "step": 16540 + }, + { + "epoch": 1.816494618932572, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.300785541534424, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.71309494972229, + "num_tokens": 427736371.0, + "step": 16541 + }, + { + "epoch": 1.8166044366351857, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.511544704437256, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6965786814689636, + "num_tokens": 
427758803.0, + "step": 16542 + }, + { + "epoch": 1.8167142543377992, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.616818428039551, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7200456857681274, + "num_tokens": 427779095.0, + "step": 16543 + }, + { + "epoch": 1.816824072040413, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6415531635284424, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6996907591819763, + "num_tokens": 427799114.0, + "step": 16544 + }, + { + "epoch": 1.8169338897430265, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.428673267364502, + "learning_rate": 1e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.6852748394012451, + "num_tokens": 427826069.0, + "step": 16545 + }, + { + "epoch": 1.8170437074456403, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5473101139068604, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7119296789169312, + "num_tokens": 427849250.0, + "step": 16546 + }, + { + "epoch": 1.817153525148254, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.247169017791748, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.698402464389801, + "num_tokens": 427880158.0, + "step": 16547 + }, + { + "epoch": 1.8172633428508675, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1852331161499023, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.714427649974823, + "num_tokens": 427911055.0, + "step": 16548 + }, + { + "epoch": 1.817373160553481, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.286367177963257, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6925952434539795, + "num_tokens": 427941798.0, + "step": 16549 + }, + { + "epoch": 1.8174829782560948, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.45116925239563, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7210358381271362, + "num_tokens": 427965018.0, + "step": 16550 + }, + { + 
"epoch": 1.8175927959587086, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6392345428466797, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7100579738616943, + "num_tokens": 427986831.0, + "step": 16551 + }, + { + "epoch": 1.8177026136613224, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3664722442626953, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7307754755020142, + "num_tokens": 428011835.0, + "step": 16552 + }, + { + "epoch": 1.8178124313639359, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.980128049850464, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7179924249649048, + "num_tokens": 428030219.0, + "step": 16553 + }, + { + "epoch": 1.8179222490665494, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 4.023497581481934, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7132141590118408, + "num_tokens": 428050693.0, + "step": 16554 + }, + { + "epoch": 1.8180320667691632, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5107638835906982, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7179355025291443, + "num_tokens": 428075890.0, + "step": 16555 + }, + { + "epoch": 1.818141884471777, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3443686962127686, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7015386819839478, + "num_tokens": 428104492.0, + "step": 16556 + }, + { + "epoch": 1.8182517021743905, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6991615295410156, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7146986722946167, + "num_tokens": 428125142.0, + "step": 16557 + }, + { + "epoch": 1.818361519877004, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4154860973358154, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7070701718330383, + "num_tokens": 428149052.0, + "step": 16558 + }, + { + "epoch": 1.8184713375796178, + 
"ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.314476728439331, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7381964325904846, + "num_tokens": 428175138.0, + "step": 16559 + }, + { + "epoch": 1.8185811552822315, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.341493844985962, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7124483585357666, + "num_tokens": 428201616.0, + "step": 16560 + }, + { + "epoch": 1.8186909729848453, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2062857151031494, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7170103788375854, + "num_tokens": 428229204.0, + "step": 16561 + }, + { + "epoch": 1.8188007906874588, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.506693124771118, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.708006739616394, + "num_tokens": 428252398.0, + "step": 16562 + }, + { + "epoch": 1.8189106083900723, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.23264217376709, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.727034330368042, + "num_tokens": 428280091.0, + "step": 16563 + }, + { + "epoch": 1.819020426092686, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.677124500274658, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7324234247207642, + "num_tokens": 428303463.0, + "step": 16564 + }, + { + "epoch": 1.8191302437952999, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.45211124420166, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7135688662528992, + "num_tokens": 428326495.0, + "step": 16565 + }, + { + "epoch": 1.8192400614979136, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3450639247894287, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.689588725566864, + "num_tokens": 428356987.0, + "step": 16566 + }, + { + "epoch": 1.8193498792005272, + "ewc_loss": 1.9311904907226562e-05, + 
"grad_norm": 2.5238516330718994, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7034299969673157, + "num_tokens": 428382039.0, + "step": 16567 + }, + { + "epoch": 1.8194596969031407, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.330822229385376, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7147039175033569, + "num_tokens": 428407816.0, + "step": 16568 + }, + { + "epoch": 1.8195695146057544, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.311582326889038, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.709721565246582, + "num_tokens": 428434878.0, + "step": 16569 + }, + { + "epoch": 1.8196793323083682, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.361309051513672, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7042196989059448, + "num_tokens": 428461640.0, + "step": 16570 + }, + { + "epoch": 1.8197891500109817, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4096291065216064, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7062384486198425, + "num_tokens": 428486922.0, + "step": 16571 + }, + { + "epoch": 1.8198989677135953, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.56050181388855, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7407269477844238, + "num_tokens": 428509178.0, + "step": 16572 + }, + { + "epoch": 1.820008785416209, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 3.1435537338256836, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7225474119186401, + "num_tokens": 428526688.0, + "step": 16573 + }, + { + "epoch": 1.8201186031188228, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.417290210723877, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6980963945388794, + "num_tokens": 428552595.0, + "step": 16574 + }, + { + "epoch": 1.8202284208214365, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3745131492614746, + 
"learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7209354043006897, + "num_tokens": 428580590.0, + "step": 16575 + }, + { + "epoch": 1.82033823852405, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1805968284606934, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7216688990592957, + "num_tokens": 428610668.0, + "step": 16576 + }, + { + "epoch": 1.8204480562266636, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.630211353302002, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7209681272506714, + "num_tokens": 428630137.0, + "step": 16577 + }, + { + "epoch": 1.8205578739292774, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3680260181427, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7237536907196045, + "num_tokens": 428656268.0, + "step": 16578 + }, + { + "epoch": 1.8206676916318911, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.311854124069214, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7183772325515747, + "num_tokens": 428684183.0, + "step": 16579 + }, + { + "epoch": 1.8207775093345049, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.340850591659546, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7304718494415283, + "num_tokens": 428708305.0, + "step": 16580 + }, + { + "epoch": 1.8208873270371184, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4194388389587402, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7198225259780884, + "num_tokens": 428732776.0, + "step": 16581 + }, + { + "epoch": 1.820997144739732, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.143873929977417, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7042398452758789, + "num_tokens": 428764808.0, + "step": 16582 + }, + { + "epoch": 1.8211069624423457, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.733480930328369, + "learning_rate": 1e-06, + "loss": 
0.9804, + "mean_token_accuracy": 0.7080519199371338, + "num_tokens": 428784878.0, + "step": 16583 + }, + { + "epoch": 1.8212167801449595, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.20298171043396, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7306907176971436, + "num_tokens": 428815897.0, + "step": 16584 + }, + { + "epoch": 1.821326597847573, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.707892894744873, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.723206102848053, + "num_tokens": 428837386.0, + "step": 16585 + }, + { + "epoch": 1.8214364155501865, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2165944576263428, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7184594869613647, + "num_tokens": 428863800.0, + "step": 16586 + }, + { + "epoch": 1.8215462332528003, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.671226978302002, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7430055141448975, + "num_tokens": 428884082.0, + "step": 16587 + }, + { + "epoch": 1.821656050955414, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6600074768066406, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7327509522438049, + "num_tokens": 428904607.0, + "step": 16588 + }, + { + "epoch": 1.8217658686580278, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.360598087310791, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7072033882141113, + "num_tokens": 428930390.0, + "step": 16589 + }, + { + "epoch": 1.8218756863606413, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4786124229431152, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7097277641296387, + "num_tokens": 428955621.0, + "step": 16590 + }, + { + "epoch": 1.8219855040632549, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2153396606445312, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 
0.7282600402832031, + "num_tokens": 428986531.0, + "step": 16591 + }, + { + "epoch": 1.8220953217658686, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4387385845184326, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7256304025650024, + "num_tokens": 429011547.0, + "step": 16592 + }, + { + "epoch": 1.8222051394684824, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3586103916168213, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7113323211669922, + "num_tokens": 429037503.0, + "step": 16593 + }, + { + "epoch": 1.822314957171096, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.484175443649292, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.704156756401062, + "num_tokens": 429063438.0, + "step": 16594 + }, + { + "epoch": 1.8224247748737097, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3380579948425293, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7176698446273804, + "num_tokens": 429089960.0, + "step": 16595 + }, + { + "epoch": 1.8225345925763232, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4712133407592773, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7098054885864258, + "num_tokens": 429112940.0, + "step": 16596 + }, + { + "epoch": 1.822644410278937, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2379586696624756, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.731626033782959, + "num_tokens": 429139904.0, + "step": 16597 + }, + { + "epoch": 1.8227542279815507, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2151427268981934, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.7040169835090637, + "num_tokens": 429172666.0, + "step": 16598 + }, + { + "epoch": 1.8228640456841643, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6035914421081543, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7174754738807678, + 
"num_tokens": 429195114.0, + "step": 16599 + }, + { + "epoch": 1.8229738633867778, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.350438356399536, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6918091177940369, + "num_tokens": 429222275.0, + "step": 16600 + }, + { + "epoch": 1.8230836810893916, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4578938484191895, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6940693855285645, + "num_tokens": 429248468.0, + "step": 16601 + }, + { + "epoch": 1.8231934987920053, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2756402492523193, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.703458309173584, + "num_tokens": 429276517.0, + "step": 16602 + }, + { + "epoch": 1.823303316494619, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1790072917938232, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7011233568191528, + "num_tokens": 429305965.0, + "step": 16603 + }, + { + "epoch": 1.8234131341972326, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3837311267852783, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7046952247619629, + "num_tokens": 429329134.0, + "step": 16604 + }, + { + "epoch": 1.8235229518998461, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2219038009643555, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.700941801071167, + "num_tokens": 429358836.0, + "step": 16605 + }, + { + "epoch": 1.82363276960246, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5908820629119873, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7277956008911133, + "num_tokens": 429380611.0, + "step": 16606 + }, + { + "epoch": 1.8237425873050737, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3954856395721436, + "learning_rate": 1e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.693587601184845, + "num_tokens": 429407180.0, + 
"step": 16607 + }, + { + "epoch": 1.8238524050076872, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3490958213806152, + "learning_rate": 1e-06, + "loss": 1.0747, + "mean_token_accuracy": 0.6864091753959656, + "num_tokens": 429435290.0, + "step": 16608 + }, + { + "epoch": 1.823962222710301, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.472254514694214, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7096537351608276, + "num_tokens": 429460877.0, + "step": 16609 + }, + { + "epoch": 1.8240720404129145, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2726142406463623, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7146544456481934, + "num_tokens": 429490257.0, + "step": 16610 + }, + { + "epoch": 1.8241818581155282, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2552781105041504, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6909981966018677, + "num_tokens": 429518656.0, + "step": 16611 + }, + { + "epoch": 1.824291675818142, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.115192413330078, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6833907961845398, + "num_tokens": 429551292.0, + "step": 16612 + }, + { + "epoch": 1.8244014935207555, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.149125576019287, + "learning_rate": 1e-06, + "loss": 1.0906, + "mean_token_accuracy": 0.6911933422088623, + "num_tokens": 429581395.0, + "step": 16613 + }, + { + "epoch": 1.824511311223369, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3730180263519287, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7160760164260864, + "num_tokens": 429610545.0, + "step": 16614 + }, + { + "epoch": 1.8246211289259828, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1749343872070312, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6941317319869995, + "num_tokens": 429640094.0, + "step": 16615 + }, + { + "epoch": 
1.8247309466285966, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.372653007507324, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6860779523849487, + "num_tokens": 429664718.0, + "step": 16616 + }, + { + "epoch": 1.8248407643312103, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.224478244781494, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7127286791801453, + "num_tokens": 429693462.0, + "step": 16617 + }, + { + "epoch": 1.8249505820338239, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3669888973236084, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.685312807559967, + "num_tokens": 429722076.0, + "step": 16618 + }, + { + "epoch": 1.8250603997364374, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2383852005004883, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6957365274429321, + "num_tokens": 429751375.0, + "step": 16619 + }, + { + "epoch": 1.8251702174390512, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.570981502532959, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.712126612663269, + "num_tokens": 429773201.0, + "step": 16620 + }, + { + "epoch": 1.825280035141665, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2245430946350098, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7348231077194214, + "num_tokens": 429801196.0, + "step": 16621 + }, + { + "epoch": 1.8253898528442785, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.485368251800537, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7128790616989136, + "num_tokens": 429823930.0, + "step": 16622 + }, + { + "epoch": 1.825499670546892, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3234939575195312, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7044165134429932, + "num_tokens": 429852022.0, + "step": 16623 + }, + { + "epoch": 1.8256094882495058, + "ewc_loss": 
1.9311904907226562e-05, + "grad_norm": 2.2275376319885254, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6901025772094727, + "num_tokens": 429881626.0, + "step": 16624 + }, + { + "epoch": 1.8257193059521195, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.247670888900757, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7089911103248596, + "num_tokens": 429910784.0, + "step": 16625 + }, + { + "epoch": 1.8258291236547333, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.169166326522827, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7131901979446411, + "num_tokens": 429940358.0, + "step": 16626 + }, + { + "epoch": 1.8259389413573468, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2189714908599854, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.689000129699707, + "num_tokens": 429969579.0, + "step": 16627 + }, + { + "epoch": 1.8260487590599603, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4280223846435547, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.7003831267356873, + "num_tokens": 429996881.0, + "step": 16628 + }, + { + "epoch": 1.826158576762574, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3601207733154297, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7163417339324951, + "num_tokens": 430022276.0, + "step": 16629 + }, + { + "epoch": 1.8262683944651878, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4265828132629395, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7149901986122131, + "num_tokens": 430047514.0, + "step": 16630 + }, + { + "epoch": 1.8263782121678016, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5852270126342773, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.708938479423523, + "num_tokens": 430068455.0, + "step": 16631 + }, + { + "epoch": 1.8264880298704151, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 
2.6142423152923584, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.707852303981781, + "num_tokens": 430091890.0, + "step": 16632 + }, + { + "epoch": 1.8265978475730287, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2313578128814697, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6815783977508545, + "num_tokens": 430122910.0, + "step": 16633 + }, + { + "epoch": 1.8267076652756424, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.421628952026367, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.7024475932121277, + "num_tokens": 430148544.0, + "step": 16634 + }, + { + "epoch": 1.8268174829782562, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3292322158813477, + "learning_rate": 1e-06, + "loss": 1.0936, + "mean_token_accuracy": 0.6884216666221619, + "num_tokens": 430177519.0, + "step": 16635 + }, + { + "epoch": 1.8269273006808697, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.449028253555298, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7089278697967529, + "num_tokens": 430202491.0, + "step": 16636 + }, + { + "epoch": 1.8270371183834833, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.384629249572754, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7173507213592529, + "num_tokens": 430228359.0, + "step": 16637 + }, + { + "epoch": 1.827146936086097, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.8278300762176514, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7142258882522583, + "num_tokens": 430246665.0, + "step": 16638 + }, + { + "epoch": 1.8272567537887108, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4322824478149414, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7146245241165161, + "num_tokens": 430271257.0, + "step": 16639 + }, + { + "epoch": 1.8273665714913245, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3565456867218018, + 
"learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7363892793655396, + "num_tokens": 430295702.0, + "step": 16640 + }, + { + "epoch": 1.827476389193938, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4113576412200928, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7183688282966614, + "num_tokens": 430319395.0, + "step": 16641 + }, + { + "epoch": 1.8275862068965516, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6379051208496094, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.7062130570411682, + "num_tokens": 430344876.0, + "step": 16642 + }, + { + "epoch": 1.8276960245991654, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.300797700881958, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.728095293045044, + "num_tokens": 430369863.0, + "step": 16643 + }, + { + "epoch": 1.8278058423017791, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.438711166381836, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.702835738658905, + "num_tokens": 430394954.0, + "step": 16644 + }, + { + "epoch": 1.8279156600043929, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1280019283294678, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6865796446800232, + "num_tokens": 430426202.0, + "step": 16645 + }, + { + "epoch": 1.8280254777070064, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.406040668487549, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6984503269195557, + "num_tokens": 430452728.0, + "step": 16646 + }, + { + "epoch": 1.82813529540962, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4422447681427, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7127236127853394, + "num_tokens": 430476884.0, + "step": 16647 + }, + { + "epoch": 1.8282451131122337, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3834619522094727, + "learning_rate": 1e-06, + "loss": 
0.9536, + "mean_token_accuracy": 0.7132749557495117, + "num_tokens": 430501811.0, + "step": 16648 + }, + { + "epoch": 1.8283549308148475, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4358627796173096, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7245887517929077, + "num_tokens": 430525781.0, + "step": 16649 + }, + { + "epoch": 1.828464748517461, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1291840076446533, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7188103795051575, + "num_tokens": 430556236.0, + "step": 16650 + }, + { + "epoch": 1.8285745662200745, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.869191884994507, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7251336574554443, + "num_tokens": 430575557.0, + "step": 16651 + }, + { + "epoch": 1.8286843839226883, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.21932053565979, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7306302189826965, + "num_tokens": 430602998.0, + "step": 16652 + }, + { + "epoch": 1.828794201625302, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.473590135574341, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7161658406257629, + "num_tokens": 430626485.0, + "step": 16653 + }, + { + "epoch": 1.8289040193279158, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.377856492996216, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7137720584869385, + "num_tokens": 430651993.0, + "step": 16654 + }, + { + "epoch": 1.8290138370305293, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.331068277359009, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7002565860748291, + "num_tokens": 430677563.0, + "step": 16655 + }, + { + "epoch": 1.8291236547331429, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2382237911224365, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 
0.6907050609588623, + "num_tokens": 430705592.0, + "step": 16656 + }, + { + "epoch": 1.8292334724357566, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.8753952980041504, + "learning_rate": 1e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.7099834680557251, + "num_tokens": 430725170.0, + "step": 16657 + }, + { + "epoch": 1.8293432901383704, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3314452171325684, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6961337327957153, + "num_tokens": 430751860.0, + "step": 16658 + }, + { + "epoch": 1.829453107840984, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2836110591888428, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6981104612350464, + "num_tokens": 430781725.0, + "step": 16659 + }, + { + "epoch": 1.8295629255435977, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2533974647521973, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7335838079452515, + "num_tokens": 430807699.0, + "step": 16660 + }, + { + "epoch": 1.8296727432462112, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.386366605758667, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7179931998252869, + "num_tokens": 430832221.0, + "step": 16661 + }, + { + "epoch": 1.829782560948825, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4467198848724365, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7125098705291748, + "num_tokens": 430856288.0, + "step": 16662 + }, + { + "epoch": 1.8298923786514387, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4733986854553223, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7051825523376465, + "num_tokens": 430878692.0, + "step": 16663 + }, + { + "epoch": 1.8300021963540523, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.349713087081909, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7252767086029053, + 
"num_tokens": 430902390.0, + "step": 16664 + }, + { + "epoch": 1.8301120140566658, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3304364681243896, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7053447365760803, + "num_tokens": 430927656.0, + "step": 16665 + }, + { + "epoch": 1.8302218317592795, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2757742404937744, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7143754959106445, + "num_tokens": 430955307.0, + "step": 16666 + }, + { + "epoch": 1.8303316494618933, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2158961296081543, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7091112732887268, + "num_tokens": 430984328.0, + "step": 16667 + }, + { + "epoch": 1.830441467164507, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.0097954273223877, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6968334913253784, + "num_tokens": 431019994.0, + "step": 16668 + }, + { + "epoch": 1.8305512848671206, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.47870135307312, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7271242737770081, + "num_tokens": 431042177.0, + "step": 16669 + }, + { + "epoch": 1.8306611025697341, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.28131365776062, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7088183164596558, + "num_tokens": 431068939.0, + "step": 16670 + }, + { + "epoch": 1.830770920272348, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.590129852294922, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6980355978012085, + "num_tokens": 431089876.0, + "step": 16671 + }, + { + "epoch": 1.8308807379749616, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3741490840911865, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.687646746635437, + "num_tokens": 431114712.0, + 
"step": 16672 + }, + { + "epoch": 1.8309905556775752, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.487138271331787, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7231653332710266, + "num_tokens": 431136945.0, + "step": 16673 + }, + { + "epoch": 1.831100373380189, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5502078533172607, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7098675966262817, + "num_tokens": 431159197.0, + "step": 16674 + }, + { + "epoch": 1.8312101910828025, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.450028419494629, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6999629735946655, + "num_tokens": 431183436.0, + "step": 16675 + }, + { + "epoch": 1.8313200087854162, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.333134889602661, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7093073129653931, + "num_tokens": 431208000.0, + "step": 16676 + }, + { + "epoch": 1.83142982648803, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6248531341552734, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7069152593612671, + "num_tokens": 431229066.0, + "step": 16677 + }, + { + "epoch": 1.8315396441906435, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.516784191131592, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7236747741699219, + "num_tokens": 431253228.0, + "step": 16678 + }, + { + "epoch": 1.831649461893257, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.388772487640381, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7049685120582581, + "num_tokens": 431277606.0, + "step": 16679 + }, + { + "epoch": 1.8317592795958708, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2191264629364014, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6898937225341797, + "num_tokens": 431306796.0, + "step": 16680 + }, + { + "epoch": 
1.8318690972984846, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3100600242614746, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7040854692459106, + "num_tokens": 431332376.0, + "step": 16681 + }, + { + "epoch": 1.8319789150010983, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.092977523803711, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7144758105278015, + "num_tokens": 431361858.0, + "step": 16682 + }, + { + "epoch": 1.8320887327037119, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4307827949523926, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7126079797744751, + "num_tokens": 431385909.0, + "step": 16683 + }, + { + "epoch": 1.8321985504063254, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3583009243011475, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6895784139633179, + "num_tokens": 431411587.0, + "step": 16684 + }, + { + "epoch": 1.8323083681089392, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2921173572540283, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7086291313171387, + "num_tokens": 431438484.0, + "step": 16685 + }, + { + "epoch": 1.832418185811553, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4409565925598145, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.692116379737854, + "num_tokens": 431463481.0, + "step": 16686 + }, + { + "epoch": 1.8325280035141664, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3904712200164795, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7106932401657104, + "num_tokens": 431491588.0, + "step": 16687 + }, + { + "epoch": 1.83263782121678, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3871827125549316, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7382155656814575, + "num_tokens": 431517339.0, + "step": 16688 + }, + { + "epoch": 1.8327476389193937, + "ewc_loss": 
1.9311904907226562e-05, + "grad_norm": 2.2780795097351074, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.741480827331543, + "num_tokens": 431543952.0, + "step": 16689 + }, + { + "epoch": 1.8328574566220075, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5687062740325928, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7061935067176819, + "num_tokens": 431566628.0, + "step": 16690 + }, + { + "epoch": 1.8329672743246213, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.342909097671509, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6996556520462036, + "num_tokens": 431595692.0, + "step": 16691 + }, + { + "epoch": 1.8330770920272348, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.603670835494995, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7024413347244263, + "num_tokens": 431617405.0, + "step": 16692 + }, + { + "epoch": 1.8331869097298483, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.672366142272949, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7152178287506104, + "num_tokens": 431639808.0, + "step": 16693 + }, + { + "epoch": 1.833296727432462, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3144876956939697, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7173328995704651, + "num_tokens": 431665541.0, + "step": 16694 + }, + { + "epoch": 1.8334065451350758, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6188721656799316, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7318950295448303, + "num_tokens": 431686833.0, + "step": 16695 + }, + { + "epoch": 1.8335163628376896, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.648120403289795, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7053312063217163, + "num_tokens": 431707512.0, + "step": 16696 + }, + { + "epoch": 1.8336261805403031, + "ewc_loss": 1.919269561767578e-05, + 
"grad_norm": 2.4281811714172363, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7118740081787109, + "num_tokens": 431734053.0, + "step": 16697 + }, + { + "epoch": 1.8337359982429167, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.319594621658325, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.7010816335678101, + "num_tokens": 431762374.0, + "step": 16698 + }, + { + "epoch": 1.8338458159455304, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3003652095794678, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.7011191844940186, + "num_tokens": 431791432.0, + "step": 16699 + }, + { + "epoch": 1.8339556336481442, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3767337799072266, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7079148888587952, + "num_tokens": 431815212.0, + "step": 16700 + }, + { + "epoch": 1.8340654513507577, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3105862140655518, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7317541837692261, + "num_tokens": 431840246.0, + "step": 16701 + }, + { + "epoch": 1.8341752690533712, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4420721530914307, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7014675140380859, + "num_tokens": 431865239.0, + "step": 16702 + }, + { + "epoch": 1.834285086755985, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.409637451171875, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7033815383911133, + "num_tokens": 431889686.0, + "step": 16703 + }, + { + "epoch": 1.8343949044585988, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2250537872314453, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7019685506820679, + "num_tokens": 431916534.0, + "step": 16704 + }, + { + "epoch": 1.8345047221612125, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.513861656188965, + 
"learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7058848142623901, + "num_tokens": 431939591.0, + "step": 16705 + }, + { + "epoch": 1.834614539863826, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.428135871887207, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7252506017684937, + "num_tokens": 431962665.0, + "step": 16706 + }, + { + "epoch": 1.8347243575664396, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4625210762023926, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7149413824081421, + "num_tokens": 431986591.0, + "step": 16707 + }, + { + "epoch": 1.8348341752690533, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2920727729797363, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7246068120002747, + "num_tokens": 432012179.0, + "step": 16708 + }, + { + "epoch": 1.834943992971667, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5134732723236084, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7088661193847656, + "num_tokens": 432035559.0, + "step": 16709 + }, + { + "epoch": 1.8350538106742806, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5122039318084717, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7241785526275635, + "num_tokens": 432058218.0, + "step": 16710 + }, + { + "epoch": 1.8351636283768944, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3112921714782715, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.720424234867096, + "num_tokens": 432087561.0, + "step": 16711 + }, + { + "epoch": 1.835273446079508, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.237403154373169, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7241514921188354, + "num_tokens": 432115675.0, + "step": 16712 + }, + { + "epoch": 1.8353832637821217, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.350585460662842, + "learning_rate": 1e-06, + "loss": 
1.0282, + "mean_token_accuracy": 0.6958794593811035, + "num_tokens": 432143933.0, + "step": 16713 + }, + { + "epoch": 1.8354930814847354, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2256693840026855, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6985976696014404, + "num_tokens": 432174272.0, + "step": 16714 + }, + { + "epoch": 1.835602899187349, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6591765880584717, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6962013244628906, + "num_tokens": 432196508.0, + "step": 16715 + }, + { + "epoch": 1.8357127168899625, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.559701919555664, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.70556640625, + "num_tokens": 432219663.0, + "step": 16716 + }, + { + "epoch": 1.8358225345925763, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1958820819854736, + "learning_rate": 1e-06, + "loss": 1.152, + "mean_token_accuracy": 0.6729803085327148, + "num_tokens": 432251645.0, + "step": 16717 + }, + { + "epoch": 1.83593235229519, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1956820487976074, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7058566808700562, + "num_tokens": 432279998.0, + "step": 16718 + }, + { + "epoch": 1.8360421699978038, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2206549644470215, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.7014985084533691, + "num_tokens": 432309369.0, + "step": 16719 + }, + { + "epoch": 1.8361519877004173, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.304750442504883, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7266343235969543, + "num_tokens": 432335263.0, + "step": 16720 + }, + { + "epoch": 1.8362618054030309, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4310142993927, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 
0.7033369541168213, + "num_tokens": 432362386.0, + "step": 16721 + }, + { + "epoch": 1.8363716231056446, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.393383741378784, + "learning_rate": 1e-06, + "loss": 1.1065, + "mean_token_accuracy": 0.6839377880096436, + "num_tokens": 432388428.0, + "step": 16722 + }, + { + "epoch": 1.8364814408082584, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3559083938598633, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6959271430969238, + "num_tokens": 432415030.0, + "step": 16723 + }, + { + "epoch": 1.836591258510872, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.36440372467041, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7119432687759399, + "num_tokens": 432438623.0, + "step": 16724 + }, + { + "epoch": 1.8367010762134857, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6709723472595215, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7274684906005859, + "num_tokens": 432458752.0, + "step": 16725 + }, + { + "epoch": 1.8368108939160992, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4708900451660156, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7032492756843567, + "num_tokens": 432482944.0, + "step": 16726 + }, + { + "epoch": 1.836920711618713, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2779834270477295, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7092421054840088, + "num_tokens": 432510600.0, + "step": 16727 + }, + { + "epoch": 1.8370305293213267, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.584319591522217, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7333187460899353, + "num_tokens": 432532220.0, + "step": 16728 + }, + { + "epoch": 1.8371403470239402, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5579254627227783, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7176951169967651, + "num_tokens": 
432556467.0, + "step": 16729 + }, + { + "epoch": 1.8372501647265538, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1823134422302246, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.695497989654541, + "num_tokens": 432588323.0, + "step": 16730 + }, + { + "epoch": 1.8373599824291675, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5471248626708984, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6964095234870911, + "num_tokens": 432610998.0, + "step": 16731 + }, + { + "epoch": 1.8374698001317813, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3368067741394043, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.7027372121810913, + "num_tokens": 432639635.0, + "step": 16732 + }, + { + "epoch": 1.837579617834395, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.409630537033081, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7307554483413696, + "num_tokens": 432665412.0, + "step": 16733 + }, + { + "epoch": 1.8376894355370086, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.786585807800293, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7235182523727417, + "num_tokens": 432684610.0, + "step": 16734 + }, + { + "epoch": 1.8377992532396221, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5571558475494385, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.7049660086631775, + "num_tokens": 432707798.0, + "step": 16735 + }, + { + "epoch": 1.8379090709422359, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.478482246398926, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7190395593643188, + "num_tokens": 432731428.0, + "step": 16736 + }, + { + "epoch": 1.8380188886448496, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.53967547416687, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.7022973895072937, + "num_tokens": 432755040.0, + "step": 16737 + }, + 
{ + "epoch": 1.8381287063474632, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6553077697753906, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7360687255859375, + "num_tokens": 432775161.0, + "step": 16738 + }, + { + "epoch": 1.838238524050077, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6473371982574463, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7170339226722717, + "num_tokens": 432797686.0, + "step": 16739 + }, + { + "epoch": 1.8383483417526905, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.057574987411499, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6947463750839233, + "num_tokens": 432831722.0, + "step": 16740 + }, + { + "epoch": 1.8384581594553042, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4225990772247314, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7117428779602051, + "num_tokens": 432857199.0, + "step": 16741 + }, + { + "epoch": 1.838567977157918, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.502478837966919, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6953033208847046, + "num_tokens": 432884640.0, + "step": 16742 + }, + { + "epoch": 1.8386777948605315, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.7411632537841797, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7092036008834839, + "num_tokens": 432906955.0, + "step": 16743 + }, + { + "epoch": 1.838787612563145, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1784439086914062, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7141931056976318, + "num_tokens": 432936136.0, + "step": 16744 + }, + { + "epoch": 1.8388974302657588, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.300691843032837, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7079840898513794, + "num_tokens": 432962710.0, + "step": 16745 + }, + { + "epoch": 
1.8390072479683726, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5591676235198975, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7140325307846069, + "num_tokens": 432985909.0, + "step": 16746 + }, + { + "epoch": 1.8391170656709863, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.546210527420044, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.723124623298645, + "num_tokens": 433005395.0, + "step": 16747 + }, + { + "epoch": 1.8392268833735999, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3291850090026855, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7261540293693542, + "num_tokens": 433031676.0, + "step": 16748 + }, + { + "epoch": 1.8393367010762134, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.344386339187622, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.7106949090957642, + "num_tokens": 433058830.0, + "step": 16749 + }, + { + "epoch": 1.8394465187788271, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.500708818435669, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7192508578300476, + "num_tokens": 433082843.0, + "step": 16750 + }, + { + "epoch": 1.839556336481441, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6512675285339355, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7198942303657532, + "num_tokens": 433102919.0, + "step": 16751 + }, + { + "epoch": 1.8396661541840544, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.889127016067505, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.729637861251831, + "num_tokens": 433120615.0, + "step": 16752 + }, + { + "epoch": 1.839775971886668, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5316805839538574, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7095709443092346, + "num_tokens": 433144967.0, + "step": 16753 + }, + { + "epoch": 1.8398857895892817, + 
"ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5931687355041504, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7072383761405945, + "num_tokens": 433167433.0, + "step": 16754 + }, + { + "epoch": 1.8399956072918955, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3470611572265625, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7056448459625244, + "num_tokens": 433194566.0, + "step": 16755 + }, + { + "epoch": 1.8401054249945092, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2024242877960205, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7170507311820984, + "num_tokens": 433222826.0, + "step": 16756 + }, + { + "epoch": 1.8402152426971228, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.404531240463257, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7126964330673218, + "num_tokens": 433246856.0, + "step": 16757 + }, + { + "epoch": 1.8403250603997363, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3859007358551025, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7223845720291138, + "num_tokens": 433270364.0, + "step": 16758 + }, + { + "epoch": 1.84043487810235, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3167197704315186, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.693942666053772, + "num_tokens": 433298091.0, + "step": 16759 + }, + { + "epoch": 1.8405446958049638, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1966068744659424, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.734534740447998, + "num_tokens": 433326190.0, + "step": 16760 + }, + { + "epoch": 1.8406545135075776, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1208322048187256, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7131478786468506, + "num_tokens": 433356604.0, + "step": 16761 + }, + { + "epoch": 1.8407643312101911, + "ewc_loss": 
1.9311904907226562e-05, + "grad_norm": 2.329822301864624, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7003345489501953, + "num_tokens": 433383447.0, + "step": 16762 + }, + { + "epoch": 1.8408741489128047, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3022801876068115, + "learning_rate": 1e-06, + "loss": 1.0743, + "mean_token_accuracy": 0.686416745185852, + "num_tokens": 433411016.0, + "step": 16763 + }, + { + "epoch": 1.8409839666154184, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3159399032592773, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7010244727134705, + "num_tokens": 433437278.0, + "step": 16764 + }, + { + "epoch": 1.8410937843180322, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5588507652282715, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7227075099945068, + "num_tokens": 433458780.0, + "step": 16765 + }, + { + "epoch": 1.8412036020206457, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.305026054382324, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6957858204841614, + "num_tokens": 433483918.0, + "step": 16766 + }, + { + "epoch": 1.8413134197232592, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.566798448562622, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7155316472053528, + "num_tokens": 433505207.0, + "step": 16767 + }, + { + "epoch": 1.841423237425873, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.477970838546753, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7361800074577332, + "num_tokens": 433527139.0, + "step": 16768 + }, + { + "epoch": 1.8415330551284868, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4287688732147217, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7077196836471558, + "num_tokens": 433552161.0, + "step": 16769 + }, + { + "epoch": 1.8416428728311005, + "ewc_loss": 1.9311904907226562e-05, + 
"grad_norm": 2.461300849914551, + "learning_rate": 1e-06, + "loss": 1.0819, + "mean_token_accuracy": 0.6858022212982178, + "num_tokens": 433578670.0, + "step": 16770 + }, + { + "epoch": 1.841752690533714, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5164361000061035, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7034133076667786, + "num_tokens": 433599905.0, + "step": 16771 + }, + { + "epoch": 1.8418625082363276, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.340275526046753, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6944081783294678, + "num_tokens": 433628044.0, + "step": 16772 + }, + { + "epoch": 1.8419723259389413, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.329045295715332, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7092870473861694, + "num_tokens": 433654532.0, + "step": 16773 + }, + { + "epoch": 1.842082143641555, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.101729154586792, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7170271873474121, + "num_tokens": 433685377.0, + "step": 16774 + }, + { + "epoch": 1.8421919613441686, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4544801712036133, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7098459005355835, + "num_tokens": 433708832.0, + "step": 16775 + }, + { + "epoch": 1.8423017790467824, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.339423894882202, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7377199530601501, + "num_tokens": 433733598.0, + "step": 16776 + }, + { + "epoch": 1.842411596749396, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.430858612060547, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7288997173309326, + "num_tokens": 433757291.0, + "step": 16777 + }, + { + "epoch": 1.8425214144520097, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.329556941986084, + 
"learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7232320308685303, + "num_tokens": 433781578.0, + "step": 16778 + }, + { + "epoch": 1.8426312321546234, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3554277420043945, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6964264512062073, + "num_tokens": 433807409.0, + "step": 16779 + }, + { + "epoch": 1.842741049857237, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1202471256256104, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7099937200546265, + "num_tokens": 433836391.0, + "step": 16780 + }, + { + "epoch": 1.8428508675598505, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1594085693359375, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6869449019432068, + "num_tokens": 433869430.0, + "step": 16781 + }, + { + "epoch": 1.8429606852624643, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1066598892211914, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7134353518486023, + "num_tokens": 433899256.0, + "step": 16782 + }, + { + "epoch": 1.843070502965078, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.469407081604004, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7040344476699829, + "num_tokens": 433922331.0, + "step": 16783 + }, + { + "epoch": 1.8431803206676918, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1094799041748047, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7094371318817139, + "num_tokens": 433952389.0, + "step": 16784 + }, + { + "epoch": 1.8432901383703053, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6597859859466553, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7244805097579956, + "num_tokens": 433972878.0, + "step": 16785 + }, + { + "epoch": 1.8433999560729188, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.612215042114258, + "learning_rate": 1e-06, + 
"loss": 0.9394, + "mean_token_accuracy": 0.7214934825897217, + "num_tokens": 433996169.0, + "step": 16786 + }, + { + "epoch": 1.8435097737755326, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2384259700775146, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7362545728683472, + "num_tokens": 434023402.0, + "step": 16787 + }, + { + "epoch": 1.8436195914781464, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3286190032958984, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7165484428405762, + "num_tokens": 434048443.0, + "step": 16788 + }, + { + "epoch": 1.84372940918076, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.338243246078491, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7238085269927979, + "num_tokens": 434074861.0, + "step": 16789 + }, + { + "epoch": 1.8438392268833736, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2793498039245605, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6962894201278687, + "num_tokens": 434105980.0, + "step": 16790 + }, + { + "epoch": 1.8439490445859872, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.473083972930908, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7111465930938721, + "num_tokens": 434129951.0, + "step": 16791 + }, + { + "epoch": 1.844058862288601, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5886714458465576, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7326374053955078, + "num_tokens": 434151178.0, + "step": 16792 + }, + { + "epoch": 1.8441686799912147, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3111965656280518, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7226951122283936, + "num_tokens": 434179016.0, + "step": 16793 + }, + { + "epoch": 1.8442784976938282, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.8668630123138428, + "learning_rate": 1e-06, + "loss": 0.9863, + 
"mean_token_accuracy": 0.7035407423973083, + "num_tokens": 434198588.0, + "step": 16794 + }, + { + "epoch": 1.8443883153964418, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.590604782104492, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.730951189994812, + "num_tokens": 434220242.0, + "step": 16795 + }, + { + "epoch": 1.8444981330990555, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2980754375457764, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7066680192947388, + "num_tokens": 434246939.0, + "step": 16796 + }, + { + "epoch": 1.8446079508016693, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.227118730545044, + "learning_rate": 1e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.6856945753097534, + "num_tokens": 434275946.0, + "step": 16797 + }, + { + "epoch": 1.844717768504283, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.56815505027771, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.7024253606796265, + "num_tokens": 434299740.0, + "step": 16798 + }, + { + "epoch": 1.8448275862068966, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.170543670654297, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6895357966423035, + "num_tokens": 434330720.0, + "step": 16799 + }, + { + "epoch": 1.84493740390951, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1960952281951904, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7075051069259644, + "num_tokens": 434359474.0, + "step": 16800 + }, + { + "epoch": 1.8450472216121239, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3788466453552246, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7191684246063232, + "num_tokens": 434383772.0, + "step": 16801 + }, + { + "epoch": 1.8451570393147376, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.178908586502075, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 
0.7107044458389282, + "num_tokens": 434412512.0, + "step": 16802 + }, + { + "epoch": 1.8452668570173512, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4193408489227295, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7113415598869324, + "num_tokens": 434435345.0, + "step": 16803 + }, + { + "epoch": 1.8453766747199647, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.428110361099243, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7060154676437378, + "num_tokens": 434459575.0, + "step": 16804 + }, + { + "epoch": 1.8454864924225785, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4426772594451904, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7020336985588074, + "num_tokens": 434487834.0, + "step": 16805 + }, + { + "epoch": 1.8455963101251922, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6840097904205322, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7141844034194946, + "num_tokens": 434508043.0, + "step": 16806 + }, + { + "epoch": 1.845706127827806, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3878533840179443, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7259553670883179, + "num_tokens": 434531909.0, + "step": 16807 + }, + { + "epoch": 1.8458159455304195, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4054768085479736, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7085155248641968, + "num_tokens": 434556812.0, + "step": 16808 + }, + { + "epoch": 1.845925763233033, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2866766452789307, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7188434600830078, + "num_tokens": 434585807.0, + "step": 16809 + }, + { + "epoch": 1.8460355809356468, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.105142831802368, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7033618688583374, + 
"num_tokens": 434619430.0, + "step": 16810 + }, + { + "epoch": 1.8461453986382605, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4537904262542725, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7061509490013123, + "num_tokens": 434644649.0, + "step": 16811 + }, + { + "epoch": 1.8462552163408743, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.125420331954956, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7142110466957092, + "num_tokens": 434675536.0, + "step": 16812 + }, + { + "epoch": 1.8463650340434878, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3405749797821045, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7079121470451355, + "num_tokens": 434702111.0, + "step": 16813 + }, + { + "epoch": 1.8464748517461014, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3049535751342773, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7313803434371948, + "num_tokens": 434727742.0, + "step": 16814 + }, + { + "epoch": 1.8465846694487151, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.354578971862793, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7138330936431885, + "num_tokens": 434753236.0, + "step": 16815 + }, + { + "epoch": 1.846694487151329, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.334212303161621, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.7003477811813354, + "num_tokens": 434780497.0, + "step": 16816 + }, + { + "epoch": 1.8468043048539424, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.661527633666992, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7289982438087463, + "num_tokens": 434801305.0, + "step": 16817 + }, + { + "epoch": 1.846914122556556, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.472707509994507, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7276606559753418, + "num_tokens": 434825040.0, + 
"step": 16818 + }, + { + "epoch": 1.8470239402591697, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6528515815734863, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6996827125549316, + "num_tokens": 434848252.0, + "step": 16819 + }, + { + "epoch": 1.8471337579617835, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4390218257904053, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7042253017425537, + "num_tokens": 434874096.0, + "step": 16820 + }, + { + "epoch": 1.8472435756643972, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5306215286254883, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7330570220947266, + "num_tokens": 434894708.0, + "step": 16821 + }, + { + "epoch": 1.8473533933670108, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.215261936187744, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7076084613800049, + "num_tokens": 434923205.0, + "step": 16822 + }, + { + "epoch": 1.8474632110696243, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6291122436523438, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7329035401344299, + "num_tokens": 434944794.0, + "step": 16823 + }, + { + "epoch": 1.847573028772238, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.333428382873535, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7028083205223083, + "num_tokens": 434969538.0, + "step": 16824 + }, + { + "epoch": 1.8476828464748518, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5767555236816406, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.707127571105957, + "num_tokens": 434991412.0, + "step": 16825 + }, + { + "epoch": 1.8477926641774656, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2918951511383057, + "learning_rate": 1e-06, + "loss": 1.077, + "mean_token_accuracy": 0.6895573139190674, + "num_tokens": 435021016.0, + "step": 16826 + }, + { + 
"epoch": 1.847902481880079, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5230259895324707, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6996954083442688, + "num_tokens": 435044767.0, + "step": 16827 + }, + { + "epoch": 1.8480122995826926, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4587607383728027, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7157431244850159, + "num_tokens": 435068460.0, + "step": 16828 + }, + { + "epoch": 1.8481221172853064, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3860585689544678, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7116396427154541, + "num_tokens": 435092176.0, + "step": 16829 + }, + { + "epoch": 1.8482319349879202, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.203348398208618, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.7011731863021851, + "num_tokens": 435122503.0, + "step": 16830 + }, + { + "epoch": 1.8483417526905337, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3976004123687744, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.711715579032898, + "num_tokens": 435147829.0, + "step": 16831 + }, + { + "epoch": 1.8484515703931472, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4599316120147705, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6971637010574341, + "num_tokens": 435170486.0, + "step": 16832 + }, + { + "epoch": 1.848561388095761, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3058276176452637, + "learning_rate": 1e-06, + "loss": 1.0819, + "mean_token_accuracy": 0.7069523334503174, + "num_tokens": 435198505.0, + "step": 16833 + }, + { + "epoch": 1.8486712057983747, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6711442470550537, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.7087182998657227, + "num_tokens": 435220991.0, + "step": 16834 + }, + { + "epoch": 1.8487810235009885, 
+ "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.373055934906006, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.7053374648094177, + "num_tokens": 435247749.0, + "step": 16835 + }, + { + "epoch": 1.848890841203602, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.815786361694336, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7048090696334839, + "num_tokens": 435266982.0, + "step": 16836 + }, + { + "epoch": 1.8490006589062156, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.7238731384277344, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7323275804519653, + "num_tokens": 435286124.0, + "step": 16837 + }, + { + "epoch": 1.8491104766088293, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.442702293395996, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7368167638778687, + "num_tokens": 435308497.0, + "step": 16838 + }, + { + "epoch": 1.849220294311443, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.291006565093994, + "learning_rate": 1e-06, + "loss": 1.0973, + "mean_token_accuracy": 0.6753408908843994, + "num_tokens": 435337722.0, + "step": 16839 + }, + { + "epoch": 1.8493301120140566, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6473305225372314, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7264832854270935, + "num_tokens": 435358826.0, + "step": 16840 + }, + { + "epoch": 1.8494399297166704, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.680400848388672, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6954087615013123, + "num_tokens": 435383016.0, + "step": 16841 + }, + { + "epoch": 1.849549747419284, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.393321990966797, + "learning_rate": 1e-06, + "loss": 1.0839, + "mean_token_accuracy": 0.6938297152519226, + "num_tokens": 435411040.0, + "step": 16842 + }, + { + "epoch": 1.8496595651218977, + "ewc_loss": 
1.9311904907226562e-05, + "grad_norm": 2.39619779586792, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.724373459815979, + "num_tokens": 435437714.0, + "step": 16843 + }, + { + "epoch": 1.8497693828245114, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.29156231880188, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.711876392364502, + "num_tokens": 435463892.0, + "step": 16844 + }, + { + "epoch": 1.849879200527125, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1145107746124268, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.69437175989151, + "num_tokens": 435494700.0, + "step": 16845 + }, + { + "epoch": 1.8499890182297385, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.774739980697632, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7195543050765991, + "num_tokens": 435514755.0, + "step": 16846 + }, + { + "epoch": 1.8500988359323522, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.808354377746582, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7352374196052551, + "num_tokens": 435534376.0, + "step": 16847 + }, + { + "epoch": 1.850208653634966, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.7608883380889893, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7064864635467529, + "num_tokens": 435557910.0, + "step": 16848 + }, + { + "epoch": 1.8503184713375798, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.109816789627075, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6947348713874817, + "num_tokens": 435593600.0, + "step": 16849 + }, + { + "epoch": 1.8504282890401933, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.609562635421753, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6897188425064087, + "num_tokens": 435615993.0, + "step": 16850 + }, + { + "epoch": 1.8505381067428068, + "ewc_loss": 1.9311904907226562e-05, + 
"grad_norm": 2.317683696746826, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.705165445804596, + "num_tokens": 435640678.0, + "step": 16851 + }, + { + "epoch": 1.8506479244454206, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3946850299835205, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6972767114639282, + "num_tokens": 435666368.0, + "step": 16852 + }, + { + "epoch": 1.8507577421480343, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.646303176879883, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.721624493598938, + "num_tokens": 435686611.0, + "step": 16853 + }, + { + "epoch": 1.8508675598506479, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2680633068084717, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.710256814956665, + "num_tokens": 435715453.0, + "step": 16854 + }, + { + "epoch": 1.8509773775532616, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5332798957824707, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6908698678016663, + "num_tokens": 435739985.0, + "step": 16855 + }, + { + "epoch": 1.8510871952558752, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.293424129486084, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.7011190056800842, + "num_tokens": 435767221.0, + "step": 16856 + }, + { + "epoch": 1.851197012958489, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5578453540802, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.735146164894104, + "num_tokens": 435790554.0, + "step": 16857 + }, + { + "epoch": 1.8513068306611027, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.261591672897339, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7181289792060852, + "num_tokens": 435817734.0, + "step": 16858 + }, + { + "epoch": 1.8514166483637162, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 32.325904846191406, + 
"learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.69578617811203, + "num_tokens": 435845409.0, + "step": 16859 + }, + { + "epoch": 1.8515264660663298, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6173553466796875, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7142503261566162, + "num_tokens": 435867146.0, + "step": 16860 + }, + { + "epoch": 1.8516362837689435, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.381899118423462, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7059780955314636, + "num_tokens": 435896313.0, + "step": 16861 + }, + { + "epoch": 1.8517461014715573, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.7487525939941406, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.7053947448730469, + "num_tokens": 435916873.0, + "step": 16862 + }, + { + "epoch": 1.851855919174171, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.33882999420166, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7259469032287598, + "num_tokens": 435940352.0, + "step": 16863 + }, + { + "epoch": 1.8519657368767846, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.385707378387451, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7203936576843262, + "num_tokens": 435965409.0, + "step": 16864 + }, + { + "epoch": 1.852075554579398, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.511280059814453, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7265310287475586, + "num_tokens": 435989123.0, + "step": 16865 + }, + { + "epoch": 1.8521853722820119, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6059935092926025, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7279276847839355, + "num_tokens": 436010896.0, + "step": 16866 + }, + { + "epoch": 1.8522951899846256, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.39678955078125, + "learning_rate": 1e-06, + "loss": 
1.0464, + "mean_token_accuracy": 0.7004545331001282, + "num_tokens": 436037438.0, + "step": 16867 + }, + { + "epoch": 1.8524050076872391, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.712846517562866, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7015929222106934, + "num_tokens": 436056645.0, + "step": 16868 + }, + { + "epoch": 1.8525148253898527, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4063303470611572, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.691959023475647, + "num_tokens": 436084261.0, + "step": 16869 + }, + { + "epoch": 1.8526246430924664, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1982157230377197, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7000960111618042, + "num_tokens": 436115170.0, + "step": 16870 + }, + { + "epoch": 1.8527344607950802, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5382680892944336, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7142465114593506, + "num_tokens": 436138017.0, + "step": 16871 + }, + { + "epoch": 1.852844278497694, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4784135818481445, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6973301768302917, + "num_tokens": 436164826.0, + "step": 16872 + }, + { + "epoch": 1.8529540962003075, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6751275062561035, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7287455797195435, + "num_tokens": 436186950.0, + "step": 16873 + }, + { + "epoch": 1.853063913902921, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1617271900177, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6927292346954346, + "num_tokens": 436218554.0, + "step": 16874 + }, + { + "epoch": 1.8531737316055348, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3501975536346436, + "learning_rate": 1e-06, + "loss": 1.1292, + "mean_token_accuracy": 
0.6782553791999817, + "num_tokens": 436245732.0, + "step": 16875 + }, + { + "epoch": 1.8532835493081485, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.0763068199157715, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7232160568237305, + "num_tokens": 436275907.0, + "step": 16876 + }, + { + "epoch": 1.8533933670107623, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.517883539199829, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7393832802772522, + "num_tokens": 436298559.0, + "step": 16877 + }, + { + "epoch": 1.8535031847133758, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1958062648773193, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6994894742965698, + "num_tokens": 436328711.0, + "step": 16878 + }, + { + "epoch": 1.8536130024159894, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.581801414489746, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7142819166183472, + "num_tokens": 436351002.0, + "step": 16879 + }, + { + "epoch": 1.8537228201186031, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4322738647460938, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6982115507125854, + "num_tokens": 436376844.0, + "step": 16880 + }, + { + "epoch": 1.8538326378212169, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.344890594482422, + "learning_rate": 1e-06, + "loss": 1.061, + "mean_token_accuracy": 0.6895221471786499, + "num_tokens": 436403455.0, + "step": 16881 + }, + { + "epoch": 1.8539424555238304, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.424386739730835, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.7077387571334839, + "num_tokens": 436427524.0, + "step": 16882 + }, + { + "epoch": 1.854052273226444, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3112144470214844, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7177770137786865, + 
"num_tokens": 436454649.0, + "step": 16883 + }, + { + "epoch": 1.8541620909290577, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.454359769821167, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.725718080997467, + "num_tokens": 436477195.0, + "step": 16884 + }, + { + "epoch": 1.8542719086316715, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.202645778656006, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.710624098777771, + "num_tokens": 436505904.0, + "step": 16885 + }, + { + "epoch": 1.8543817263342852, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6852235794067383, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7082010507583618, + "num_tokens": 436531585.0, + "step": 16886 + }, + { + "epoch": 1.8544915440368988, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3033676147460938, + "learning_rate": 1e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6902024745941162, + "num_tokens": 436563798.0, + "step": 16887 + }, + { + "epoch": 1.8546013617395123, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.220140218734741, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.7094178199768066, + "num_tokens": 436592580.0, + "step": 16888 + }, + { + "epoch": 1.854711179442126, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4508743286132812, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7250522375106812, + "num_tokens": 436615814.0, + "step": 16889 + }, + { + "epoch": 1.8548209971447398, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3046321868896484, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7167236804962158, + "num_tokens": 436642617.0, + "step": 16890 + }, + { + "epoch": 1.8549308148473536, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.495896816253662, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7003418207168579, + "num_tokens": 436666971.0, + 
"step": 16891 + }, + { + "epoch": 1.855040632549967, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3029632568359375, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.697769045829773, + "num_tokens": 436695065.0, + "step": 16892 + }, + { + "epoch": 1.8551504502525806, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.18979549407959, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7042858600616455, + "num_tokens": 436724658.0, + "step": 16893 + }, + { + "epoch": 1.8552602679551944, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3529160022735596, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7087466716766357, + "num_tokens": 436750097.0, + "step": 16894 + }, + { + "epoch": 1.8553700856578081, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3763651847839355, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7204177379608154, + "num_tokens": 436774023.0, + "step": 16895 + }, + { + "epoch": 1.8554799033604217, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.270878553390503, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7094701528549194, + "num_tokens": 436800885.0, + "step": 16896 + }, + { + "epoch": 1.8555897210630352, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.7148094177246094, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7026995420455933, + "num_tokens": 436824212.0, + "step": 16897 + }, + { + "epoch": 1.855699538765649, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.8041446208953857, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7290786504745483, + "num_tokens": 436843050.0, + "step": 16898 + }, + { + "epoch": 1.8558093564682627, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2364723682403564, + "learning_rate": 1e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.6903404593467712, + "num_tokens": 436880926.0, + "step": 16899 + }, + { + 
"epoch": 1.8559191741708765, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3049163818359375, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.70587158203125, + "num_tokens": 436908418.0, + "step": 16900 + }, + { + "epoch": 1.85602899187349, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4362964630126953, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7305229306221008, + "num_tokens": 436933281.0, + "step": 16901 + }, + { + "epoch": 1.8561388095761036, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.219417095184326, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7127470970153809, + "num_tokens": 436959755.0, + "step": 16902 + }, + { + "epoch": 1.8562486272787173, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3985490798950195, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7406115531921387, + "num_tokens": 436983183.0, + "step": 16903 + }, + { + "epoch": 1.856358444981331, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.7669031620025635, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.752060055732727, + "num_tokens": 437000635.0, + "step": 16904 + }, + { + "epoch": 1.8564682626839446, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3193838596343994, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.7031018733978271, + "num_tokens": 437025503.0, + "step": 16905 + }, + { + "epoch": 1.8565780803865584, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4367969036102295, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.714169979095459, + "num_tokens": 437051630.0, + "step": 16906 + }, + { + "epoch": 1.856687898089172, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.449676036834717, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7326580286026001, + "num_tokens": 437076770.0, + "step": 16907 + }, + { + "epoch": 1.8567977157917857, + 
"ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1571319103240967, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7174490690231323, + "num_tokens": 437108400.0, + "step": 16908 + }, + { + "epoch": 1.8569075334943994, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.124051570892334, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7006090879440308, + "num_tokens": 437140555.0, + "step": 16909 + }, + { + "epoch": 1.857017351197013, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5797760486602783, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7152339220046997, + "num_tokens": 437162231.0, + "step": 16910 + }, + { + "epoch": 1.8571271688996265, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2266993522644043, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7055506706237793, + "num_tokens": 437188757.0, + "step": 16911 + }, + { + "epoch": 1.8572369866022402, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2053656578063965, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7342149019241333, + "num_tokens": 437215715.0, + "step": 16912 + }, + { + "epoch": 1.857346804304854, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.35947322845459, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7143704891204834, + "num_tokens": 437240293.0, + "step": 16913 + }, + { + "epoch": 1.8574566220074678, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1754045486450195, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7198436260223389, + "num_tokens": 437271806.0, + "step": 16914 + }, + { + "epoch": 1.8575664397100813, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5404608249664307, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7258278131484985, + "num_tokens": 437295395.0, + "step": 16915 + }, + { + "epoch": 1.8576762574126948, + "ewc_loss": 
1.9311904907226562e-05, + "grad_norm": 2.7236533164978027, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7086941003799438, + "num_tokens": 437316129.0, + "step": 16916 + }, + { + "epoch": 1.8577860751153086, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3602213859558105, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6926617622375488, + "num_tokens": 437342790.0, + "step": 16917 + }, + { + "epoch": 1.8578958928179223, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.181596279144287, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6906977891921997, + "num_tokens": 437372755.0, + "step": 16918 + }, + { + "epoch": 1.8580057105205359, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.0423617362976074, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7064987421035767, + "num_tokens": 437406550.0, + "step": 16919 + }, + { + "epoch": 1.8581155282231496, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5008976459503174, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7373489141464233, + "num_tokens": 437428676.0, + "step": 16920 + }, + { + "epoch": 1.8582253459257632, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3611347675323486, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6923856735229492, + "num_tokens": 437455018.0, + "step": 16921 + }, + { + "epoch": 1.858335163628377, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5084903240203857, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6959162354469299, + "num_tokens": 437479847.0, + "step": 16922 + }, + { + "epoch": 1.8584449813309907, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.410382032394409, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.698732852935791, + "num_tokens": 437507113.0, + "step": 16923 + }, + { + "epoch": 1.8585547990336042, + "ewc_loss": 1.9311904907226562e-05, + 
"grad_norm": 2.6226541996002197, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6982501745223999, + "num_tokens": 437529316.0, + "step": 16924 + }, + { + "epoch": 1.8586646167362177, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.198371648788452, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6824988126754761, + "num_tokens": 437560522.0, + "step": 16925 + }, + { + "epoch": 1.8587744344388315, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.218902587890625, + "learning_rate": 1e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.6889652013778687, + "num_tokens": 437590821.0, + "step": 16926 + }, + { + "epoch": 1.8588842521414453, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4172544479370117, + "learning_rate": 1e-06, + "loss": 1.1182, + "mean_token_accuracy": 0.6936886310577393, + "num_tokens": 437619416.0, + "step": 16927 + }, + { + "epoch": 1.858994069844059, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.588374137878418, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6926038265228271, + "num_tokens": 437643616.0, + "step": 16928 + }, + { + "epoch": 1.8591038875466726, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5229504108428955, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7198100686073303, + "num_tokens": 437670853.0, + "step": 16929 + }, + { + "epoch": 1.859213705249286, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4867613315582275, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7188441753387451, + "num_tokens": 437692695.0, + "step": 16930 + }, + { + "epoch": 1.8593235229518998, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4067795276641846, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7163533568382263, + "num_tokens": 437719371.0, + "step": 16931 + }, + { + "epoch": 1.8594333406545136, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.42236590385437, 
+ "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7230032682418823, + "num_tokens": 437743999.0, + "step": 16932 + }, + { + "epoch": 1.8595431583571271, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.470449924468994, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7015913724899292, + "num_tokens": 437766725.0, + "step": 16933 + }, + { + "epoch": 1.8596529760597407, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4397759437561035, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7367194294929504, + "num_tokens": 437790488.0, + "step": 16934 + }, + { + "epoch": 1.8597627937623544, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5454294681549072, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7031300663948059, + "num_tokens": 437814868.0, + "step": 16935 + }, + { + "epoch": 1.8598726114649682, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.664722442626953, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7160181999206543, + "num_tokens": 437834285.0, + "step": 16936 + }, + { + "epoch": 1.859982429167582, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.247790575027466, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6895655989646912, + "num_tokens": 437862652.0, + "step": 16937 + }, + { + "epoch": 1.8600922468701955, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3461461067199707, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7094003558158875, + "num_tokens": 437886820.0, + "step": 16938 + }, + { + "epoch": 1.860202064572809, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1030666828155518, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7179523706436157, + "num_tokens": 437919256.0, + "step": 16939 + }, + { + "epoch": 1.8603118822754228, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3608884811401367, + "learning_rate": 1e-06, + 
"loss": 1.0365, + "mean_token_accuracy": 0.6930995583534241, + "num_tokens": 437947549.0, + "step": 16940 + }, + { + "epoch": 1.8604216999780365, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4985504150390625, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6960214972496033, + "num_tokens": 437972843.0, + "step": 16941 + }, + { + "epoch": 1.8605315176806503, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.501333236694336, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7178317308425903, + "num_tokens": 437998600.0, + "step": 16942 + }, + { + "epoch": 1.8606413353832638, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1878890991210938, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7127652168273926, + "num_tokens": 438028102.0, + "step": 16943 + }, + { + "epoch": 1.8607511530858774, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.469665765762329, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6968109011650085, + "num_tokens": 438052822.0, + "step": 16944 + }, + { + "epoch": 1.860860970788491, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.25144624710083, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7021806240081787, + "num_tokens": 438082844.0, + "step": 16945 + }, + { + "epoch": 1.8609707884911049, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4312376976013184, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7099353075027466, + "num_tokens": 438109890.0, + "step": 16946 + }, + { + "epoch": 1.8610806061937184, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.1200454235076904, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.7088515758514404, + "num_tokens": 438140895.0, + "step": 16947 + }, + { + "epoch": 1.861190423896332, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.378657102584839, + "learning_rate": 1e-06, + "loss": 0.9283, + 
"mean_token_accuracy": 0.7237687110900879, + "num_tokens": 438165806.0, + "step": 16948 + }, + { + "epoch": 1.8613002415989457, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.543191909790039, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7239759564399719, + "num_tokens": 438190317.0, + "step": 16949 + }, + { + "epoch": 1.8614100593015594, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3484647274017334, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7062699794769287, + "num_tokens": 438215540.0, + "step": 16950 + }, + { + "epoch": 1.8615198770041732, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.432403564453125, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7213675379753113, + "num_tokens": 438239349.0, + "step": 16951 + }, + { + "epoch": 1.8616296947067867, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3123602867126465, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6930782794952393, + "num_tokens": 438272407.0, + "step": 16952 + }, + { + "epoch": 1.8617395124094003, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6606040000915527, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7434613704681396, + "num_tokens": 438292060.0, + "step": 16953 + }, + { + "epoch": 1.861849330112014, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.123704195022583, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6978723406791687, + "num_tokens": 438323163.0, + "step": 16954 + }, + { + "epoch": 1.8619591478146278, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.329854965209961, + "learning_rate": 1e-06, + "loss": 1.0806, + "mean_token_accuracy": 0.6905145645141602, + "num_tokens": 438349393.0, + "step": 16955 + }, + { + "epoch": 1.8620689655172413, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3414130210876465, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 
0.7131849527359009, + "num_tokens": 438374889.0, + "step": 16956 + }, + { + "epoch": 1.862178783219855, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.209482192993164, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6980659365653992, + "num_tokens": 438404116.0, + "step": 16957 + }, + { + "epoch": 1.8622886009224686, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4762520790100098, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7085119485855103, + "num_tokens": 438428559.0, + "step": 16958 + }, + { + "epoch": 1.8623984186250824, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.629110097885132, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6980360150337219, + "num_tokens": 438452106.0, + "step": 16959 + }, + { + "epoch": 1.8625082363276961, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4069664478302, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7067604064941406, + "num_tokens": 438476888.0, + "step": 16960 + }, + { + "epoch": 1.8626180540303097, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.117249011993408, + "learning_rate": 1e-06, + "loss": 1.061, + "mean_token_accuracy": 0.6880510449409485, + "num_tokens": 438509799.0, + "step": 16961 + }, + { + "epoch": 1.8627278717329232, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4010064601898193, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7134014964103699, + "num_tokens": 438533819.0, + "step": 16962 + }, + { + "epoch": 1.862837689435537, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.362309455871582, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6965914964675903, + "num_tokens": 438559396.0, + "step": 16963 + }, + { + "epoch": 1.8629475071381507, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4122395515441895, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7431803345680237, + "num_tokens": 
438581493.0, + "step": 16964 + }, + { + "epoch": 1.8630573248407645, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5555331707000732, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.731410562992096, + "num_tokens": 438603969.0, + "step": 16965 + }, + { + "epoch": 1.863167142543378, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.304759979248047, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7257118821144104, + "num_tokens": 438629259.0, + "step": 16966 + }, + { + "epoch": 1.8632769602459915, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1996748447418213, + "learning_rate": 1e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.6784316301345825, + "num_tokens": 438659759.0, + "step": 16967 + }, + { + "epoch": 1.8633867779486053, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.0864040851593018, + "learning_rate": 1e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6861941814422607, + "num_tokens": 438693993.0, + "step": 16968 + }, + { + "epoch": 1.863496595651219, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4366724491119385, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7227016687393188, + "num_tokens": 438717877.0, + "step": 16969 + }, + { + "epoch": 1.8636064133538326, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3748409748077393, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7122555375099182, + "num_tokens": 438741555.0, + "step": 16970 + }, + { + "epoch": 1.8637162310564463, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2055675983428955, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.7002121806144714, + "num_tokens": 438772483.0, + "step": 16971 + }, + { + "epoch": 1.8638260487590599, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4251980781555176, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7266746163368225, + "num_tokens": 438796946.0, + "step": 16972 
+ }, + { + "epoch": 1.8639358664616736, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.06935715675354, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6987985372543335, + "num_tokens": 438828630.0, + "step": 16973 + }, + { + "epoch": 1.8640456841642874, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4714558124542236, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7164919376373291, + "num_tokens": 438854638.0, + "step": 16974 + }, + { + "epoch": 1.864155501866901, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.340054750442505, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7226403951644897, + "num_tokens": 438880277.0, + "step": 16975 + }, + { + "epoch": 1.8642653195695145, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.0467312335968018, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6988641023635864, + "num_tokens": 438913785.0, + "step": 16976 + }, + { + "epoch": 1.8643751372721282, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6357715129852295, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7215986847877502, + "num_tokens": 438935297.0, + "step": 16977 + }, + { + "epoch": 1.864484954974742, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.30208158493042, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7138209342956543, + "num_tokens": 438960935.0, + "step": 16978 + }, + { + "epoch": 1.8645947726773557, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4744763374328613, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7150112390518188, + "num_tokens": 438985536.0, + "step": 16979 + }, + { + "epoch": 1.8647045903799693, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2846415042877197, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6864689588546753, + "num_tokens": 439014239.0, + "step": 16980 + }, + { + "epoch": 
1.8648144080825828, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.365825891494751, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7113282084465027, + "num_tokens": 439039532.0, + "step": 16981 + }, + { + "epoch": 1.8649242257851966, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1734540462493896, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7096375226974487, + "num_tokens": 439070929.0, + "step": 16982 + }, + { + "epoch": 1.8650340434878103, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.491074562072754, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7152242660522461, + "num_tokens": 439093825.0, + "step": 16983 + }, + { + "epoch": 1.8651438611904239, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4855434894561768, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7028357982635498, + "num_tokens": 439117753.0, + "step": 16984 + }, + { + "epoch": 1.8652536788930374, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.215064764022827, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7412905693054199, + "num_tokens": 439144930.0, + "step": 16985 + }, + { + "epoch": 1.8653634965956511, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7478318214416504, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7109199166297913, + "num_tokens": 439165820.0, + "step": 16986 + }, + { + "epoch": 1.865473314298265, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5856804847717285, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7291591167449951, + "num_tokens": 439188198.0, + "step": 16987 + }, + { + "epoch": 1.8655831320008787, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2321181297302246, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7251618504524231, + "num_tokens": 439215580.0, + "step": 16988 + }, + { + "epoch": 1.8656929497034922, + 
"ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.4057812690734863, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7193567752838135, + "num_tokens": 439238300.0, + "step": 16989 + }, + { + "epoch": 1.8658027674061057, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.413684368133545, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.699688196182251, + "num_tokens": 439263492.0, + "step": 16990 + }, + { + "epoch": 1.8659125851087195, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4257216453552246, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7049189209938049, + "num_tokens": 439290625.0, + "step": 16991 + }, + { + "epoch": 1.8660224028113332, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.42565655708313, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6860558390617371, + "num_tokens": 439316988.0, + "step": 16992 + }, + { + "epoch": 1.866132220513947, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.7248013019561768, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7211989760398865, + "num_tokens": 439338441.0, + "step": 16993 + }, + { + "epoch": 1.8662420382165605, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.9915432929992676, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7220420837402344, + "num_tokens": 439359117.0, + "step": 16994 + }, + { + "epoch": 1.866351855919174, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4253652095794678, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7038586139678955, + "num_tokens": 439383259.0, + "step": 16995 + }, + { + "epoch": 1.8664616736217878, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.342343807220459, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7066611647605896, + "num_tokens": 439407786.0, + "step": 16996 + }, + { + "epoch": 1.8665714913244016, + "ewc_loss": 
1.9431114196777344e-05, + "grad_norm": 2.204808473587036, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7018085718154907, + "num_tokens": 439437130.0, + "step": 16997 + }, + { + "epoch": 1.8666813090270151, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2033634185791016, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7282389402389526, + "num_tokens": 439465442.0, + "step": 16998 + }, + { + "epoch": 1.8667911267296287, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2754805088043213, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7057283520698547, + "num_tokens": 439494648.0, + "step": 16999 + }, + { + "epoch": 1.8669009444322424, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3148856163024902, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7147488594055176, + "num_tokens": 439520459.0, + "step": 17000 + }, + { + "epoch": 1.8670107621348562, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.423699140548706, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6980594992637634, + "num_tokens": 439546501.0, + "step": 17001 + }, + { + "epoch": 1.86712057983747, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1625819206237793, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.715605616569519, + "num_tokens": 439576103.0, + "step": 17002 + }, + { + "epoch": 1.8672303975400835, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.8621113300323486, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.7058274745941162, + "num_tokens": 439597541.0, + "step": 17003 + }, + { + "epoch": 1.867340215242697, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.227137804031372, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7192988395690918, + "num_tokens": 439627409.0, + "step": 17004 + }, + { + "epoch": 1.8674500329453108, + "ewc_loss": 1.9431114196777344e-05, + 
"grad_norm": 2.5851638317108154, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7051408886909485, + "num_tokens": 439649990.0, + "step": 17005 + }, + { + "epoch": 1.8675598506479245, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.381662368774414, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7196251153945923, + "num_tokens": 439676100.0, + "step": 17006 + }, + { + "epoch": 1.8676696683505383, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3909659385681152, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6976324915885925, + "num_tokens": 439701954.0, + "step": 17007 + }, + { + "epoch": 1.8677794860531518, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3464181423187256, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7102499008178711, + "num_tokens": 439729123.0, + "step": 17008 + }, + { + "epoch": 1.8678893037557653, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.317261219024658, + "learning_rate": 1e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.691346287727356, + "num_tokens": 439755615.0, + "step": 17009 + }, + { + "epoch": 1.867999121458379, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.220629930496216, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.700390100479126, + "num_tokens": 439786232.0, + "step": 17010 + }, + { + "epoch": 1.8681089391609929, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.428637981414795, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7155733108520508, + "num_tokens": 439810890.0, + "step": 17011 + }, + { + "epoch": 1.8682187568636064, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5061731338500977, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7062878012657166, + "num_tokens": 439833482.0, + "step": 17012 + }, + { + "epoch": 1.86832857456622, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.262087821960449, + 
"learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7173599004745483, + "num_tokens": 439859092.0, + "step": 17013 + }, + { + "epoch": 1.8684383922688337, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.281867027282715, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7123308777809143, + "num_tokens": 439886906.0, + "step": 17014 + }, + { + "epoch": 1.8685482099714474, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.323007822036743, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.697364091873169, + "num_tokens": 439912722.0, + "step": 17015 + }, + { + "epoch": 1.8686580276740612, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.457599401473999, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7228326201438904, + "num_tokens": 439936368.0, + "step": 17016 + }, + { + "epoch": 1.8687678453766747, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5658726692199707, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.712709367275238, + "num_tokens": 439961316.0, + "step": 17017 + }, + { + "epoch": 1.8688776630792883, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.368262767791748, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.68880295753479, + "num_tokens": 439989532.0, + "step": 17018 + }, + { + "epoch": 1.868987480781902, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.395589828491211, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7073795199394226, + "num_tokens": 440016373.0, + "step": 17019 + }, + { + "epoch": 1.8690972984845158, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2621142864227295, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7297857999801636, + "num_tokens": 440041652.0, + "step": 17020 + }, + { + "epoch": 1.8692071161871293, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.441732883453369, + "learning_rate": 1e-06, + "loss": 
1.0029, + "mean_token_accuracy": 0.7058773040771484, + "num_tokens": 440065463.0, + "step": 17021 + }, + { + "epoch": 1.869316933889743, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.348889112472534, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6940574645996094, + "num_tokens": 440092608.0, + "step": 17022 + }, + { + "epoch": 1.8694267515923566, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.133115768432617, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7095314264297485, + "num_tokens": 440122046.0, + "step": 17023 + }, + { + "epoch": 1.8695365692949704, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.104224681854248, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7195841073989868, + "num_tokens": 440152237.0, + "step": 17024 + }, + { + "epoch": 1.8696463869975841, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.307651996612549, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6966704726219177, + "num_tokens": 440179011.0, + "step": 17025 + }, + { + "epoch": 1.8697562047001977, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3785526752471924, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7216736078262329, + "num_tokens": 440202428.0, + "step": 17026 + }, + { + "epoch": 1.8698660224028112, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.49057674407959, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6938021779060364, + "num_tokens": 440228411.0, + "step": 17027 + }, + { + "epoch": 1.869975840105425, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.35507869720459, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7177228927612305, + "num_tokens": 440253868.0, + "step": 17028 + }, + { + "epoch": 1.8700856578080387, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4135830402374268, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 
0.7213417291641235, + "num_tokens": 440276775.0, + "step": 17029 + }, + { + "epoch": 1.8701954755106525, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.250176191329956, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7036151885986328, + "num_tokens": 440304245.0, + "step": 17030 + }, + { + "epoch": 1.870305293213266, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4848551750183105, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7169198989868164, + "num_tokens": 440328836.0, + "step": 17031 + }, + { + "epoch": 1.8704151109158795, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5425682067871094, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7166193127632141, + "num_tokens": 440350971.0, + "step": 17032 + }, + { + "epoch": 1.8705249286184933, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.507709503173828, + "learning_rate": 1e-06, + "loss": 1.1249, + "mean_token_accuracy": 0.6688961982727051, + "num_tokens": 440377766.0, + "step": 17033 + }, + { + "epoch": 1.870634746321107, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.177135705947876, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7293257713317871, + "num_tokens": 440407060.0, + "step": 17034 + }, + { + "epoch": 1.8707445640237206, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2362000942230225, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6951236724853516, + "num_tokens": 440433321.0, + "step": 17035 + }, + { + "epoch": 1.8708543817263343, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2475008964538574, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7139079570770264, + "num_tokens": 440461191.0, + "step": 17036 + }, + { + "epoch": 1.8709641994289479, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.416954755783081, + "learning_rate": 1e-06, + "loss": 1.0804, + "mean_token_accuracy": 0.687829315662384, + 
"num_tokens": 440485817.0, + "step": 17037 + }, + { + "epoch": 1.8710740171315616, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.183922529220581, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7070460915565491, + "num_tokens": 440512880.0, + "step": 17038 + }, + { + "epoch": 1.8711838348341754, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.125807046890259, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7191802263259888, + "num_tokens": 440543790.0, + "step": 17039 + }, + { + "epoch": 1.871293652536789, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4413580894470215, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7017658352851868, + "num_tokens": 440568853.0, + "step": 17040 + }, + { + "epoch": 1.8714034702394025, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1385395526885986, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6939022541046143, + "num_tokens": 440599282.0, + "step": 17041 + }, + { + "epoch": 1.8715132879420162, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1496782302856445, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7406400442123413, + "num_tokens": 440629140.0, + "step": 17042 + }, + { + "epoch": 1.87162310564463, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.413180112838745, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7121174335479736, + "num_tokens": 440653455.0, + "step": 17043 + }, + { + "epoch": 1.8717329233472437, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2627017498016357, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7148975729942322, + "num_tokens": 440680585.0, + "step": 17044 + }, + { + "epoch": 1.8718427410498573, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.5365281105041504, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7179456353187561, + "num_tokens": 440703091.0, + 
"step": 17045 + }, + { + "epoch": 1.8719525587524708, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3619601726531982, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7134590744972229, + "num_tokens": 440729516.0, + "step": 17046 + }, + { + "epoch": 1.8720623764550846, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.104339361190796, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.718474805355072, + "num_tokens": 440764333.0, + "step": 17047 + }, + { + "epoch": 1.8721721941576983, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.187519073486328, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6948880553245544, + "num_tokens": 440794112.0, + "step": 17048 + }, + { + "epoch": 1.8722820118603118, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4573886394500732, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7284221053123474, + "num_tokens": 440816068.0, + "step": 17049 + }, + { + "epoch": 1.8723918295629254, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.511251926422119, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7222678661346436, + "num_tokens": 440839606.0, + "step": 17050 + }, + { + "epoch": 1.8725016472655391, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3155059814453125, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6989523768424988, + "num_tokens": 440867155.0, + "step": 17051 + }, + { + "epoch": 1.872611464968153, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2341530323028564, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.7082163691520691, + "num_tokens": 440892677.0, + "step": 17052 + }, + { + "epoch": 1.8727212826707667, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3077263832092285, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6991299390792847, + "num_tokens": 440920143.0, + "step": 17053 + }, + { + 
"epoch": 1.8728311003733802, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.545118808746338, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7574010491371155, + "num_tokens": 440940554.0, + "step": 17054 + }, + { + "epoch": 1.8729409180759937, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.169520139694214, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7244186997413635, + "num_tokens": 440971347.0, + "step": 17055 + }, + { + "epoch": 1.8730507357786075, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4173474311828613, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.7046851515769958, + "num_tokens": 440997428.0, + "step": 17056 + }, + { + "epoch": 1.8731605534812212, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.56410551071167, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.721852719783783, + "num_tokens": 441020541.0, + "step": 17057 + }, + { + "epoch": 1.873270371183835, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.428311347961426, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7490686178207397, + "num_tokens": 441044714.0, + "step": 17058 + }, + { + "epoch": 1.8733801888864485, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.488612651824951, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7228279709815979, + "num_tokens": 441068532.0, + "step": 17059 + }, + { + "epoch": 1.873490006589062, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.707301139831543, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7409111261367798, + "num_tokens": 441091202.0, + "step": 17060 + }, + { + "epoch": 1.8735998242916758, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.681675672531128, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7058694958686829, + "num_tokens": 441112498.0, + "step": 17061 + }, + { + "epoch": 1.8737096419942896, + 
"ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2322723865509033, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7011502385139465, + "num_tokens": 441142109.0, + "step": 17062 + }, + { + "epoch": 1.8738194596969031, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.8571317195892334, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6932427883148193, + "num_tokens": 441164003.0, + "step": 17063 + }, + { + "epoch": 1.8739292773995166, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3887975215911865, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.685288667678833, + "num_tokens": 441189905.0, + "step": 17064 + }, + { + "epoch": 1.8740390951021304, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4499523639678955, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7135857343673706, + "num_tokens": 441215805.0, + "step": 17065 + }, + { + "epoch": 1.8741489128047442, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.280932664871216, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6989944577217102, + "num_tokens": 441246601.0, + "step": 17066 + }, + { + "epoch": 1.874258730507358, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2592830657958984, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7207146883010864, + "num_tokens": 441274797.0, + "step": 17067 + }, + { + "epoch": 1.8743685482099715, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.383648633956909, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6973710656166077, + "num_tokens": 441301370.0, + "step": 17068 + }, + { + "epoch": 1.874478365912585, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.588738203048706, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7152307629585266, + "num_tokens": 441323660.0, + "step": 17069 + }, + { + "epoch": 1.8745881836151987, + "ewc_loss": 
1.9311904907226562e-05, + "grad_norm": 2.2484946250915527, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7010511159896851, + "num_tokens": 441352209.0, + "step": 17070 + }, + { + "epoch": 1.8746980013178125, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3143129348754883, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7097745537757874, + "num_tokens": 441383721.0, + "step": 17071 + }, + { + "epoch": 1.8748078190204263, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.440948009490967, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.7023929357528687, + "num_tokens": 441409299.0, + "step": 17072 + }, + { + "epoch": 1.8749176367230398, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3934438228607178, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7094409465789795, + "num_tokens": 441434302.0, + "step": 17073 + }, + { + "epoch": 1.8750274544256533, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2943379878997803, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7126033306121826, + "num_tokens": 441461110.0, + "step": 17074 + }, + { + "epoch": 1.875137272128267, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.06984806060791, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7093281745910645, + "num_tokens": 441493487.0, + "step": 17075 + }, + { + "epoch": 1.8752470898308808, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.360844612121582, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7362351417541504, + "num_tokens": 441518779.0, + "step": 17076 + }, + { + "epoch": 1.8753569075334944, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.437298536300659, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7320046424865723, + "num_tokens": 441542646.0, + "step": 17077 + }, + { + "epoch": 1.875466725236108, + "ewc_loss": 1.9311904907226562e-05, + 
"grad_norm": 2.061323642730713, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7158675789833069, + "num_tokens": 441573331.0, + "step": 17078 + }, + { + "epoch": 1.8755765429387217, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.214139699935913, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7237322330474854, + "num_tokens": 441602259.0, + "step": 17079 + }, + { + "epoch": 1.8756863606413354, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.138585329055786, + "learning_rate": 1e-06, + "loss": 1.0943, + "mean_token_accuracy": 0.6876556873321533, + "num_tokens": 441634513.0, + "step": 17080 + }, + { + "epoch": 1.8757961783439492, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5665199756622314, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7185657024383545, + "num_tokens": 441657125.0, + "step": 17081 + }, + { + "epoch": 1.8759059960465627, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3881173133850098, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7036982774734497, + "num_tokens": 441683874.0, + "step": 17082 + }, + { + "epoch": 1.8760158137491763, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4350762367248535, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6881675124168396, + "num_tokens": 441710732.0, + "step": 17083 + }, + { + "epoch": 1.87612563145179, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4596095085144043, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7229455709457397, + "num_tokens": 441736131.0, + "step": 17084 + }, + { + "epoch": 1.8762354491544038, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6002449989318848, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7067073583602905, + "num_tokens": 441758858.0, + "step": 17085 + }, + { + "epoch": 1.8763452668570173, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 
2.5601937770843506, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7198067903518677, + "num_tokens": 441782241.0, + "step": 17086 + }, + { + "epoch": 1.876455084559631, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5480566024780273, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.72089684009552, + "num_tokens": 441805904.0, + "step": 17087 + }, + { + "epoch": 1.8765649022622446, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.805431365966797, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7199124693870544, + "num_tokens": 441826314.0, + "step": 17088 + }, + { + "epoch": 1.8766747199648584, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.602944850921631, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7094194889068604, + "num_tokens": 441852105.0, + "step": 17089 + }, + { + "epoch": 1.876784537667472, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.200659990310669, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.703383207321167, + "num_tokens": 441882518.0, + "step": 17090 + }, + { + "epoch": 1.8768943553700856, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.7169382572174072, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.698417067527771, + "num_tokens": 441905581.0, + "step": 17091 + }, + { + "epoch": 1.8770041730726992, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.901163101196289, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7379878759384155, + "num_tokens": 441921785.0, + "step": 17092 + }, + { + "epoch": 1.877113990775313, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.509695053100586, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7249141931533813, + "num_tokens": 441944956.0, + "step": 17093 + }, + { + "epoch": 1.8772238084779267, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6091370582580566, + "learning_rate": 
1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.717577338218689, + "num_tokens": 441967212.0, + "step": 17094 + }, + { + "epoch": 1.8773336261805404, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2222092151641846, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.705172061920166, + "num_tokens": 441997960.0, + "step": 17095 + }, + { + "epoch": 1.877443443883154, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4360909461975098, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7034959197044373, + "num_tokens": 442022187.0, + "step": 17096 + }, + { + "epoch": 1.8775532615857675, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.3261351585388184, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6907117366790771, + "num_tokens": 442050708.0, + "step": 17097 + }, + { + "epoch": 1.8776630792883813, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3956210613250732, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7361609935760498, + "num_tokens": 442075492.0, + "step": 17098 + }, + { + "epoch": 1.877772896990995, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.34635066986084, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6920626163482666, + "num_tokens": 442102548.0, + "step": 17099 + }, + { + "epoch": 1.8778827146936086, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.305312156677246, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.7014120817184448, + "num_tokens": 442130805.0, + "step": 17100 + }, + { + "epoch": 1.8779925323962223, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.524549961090088, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.7028701901435852, + "num_tokens": 442154279.0, + "step": 17101 + }, + { + "epoch": 1.8781023500988359, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.335650682449341, + "learning_rate": 1e-06, + "loss": 0.9947, + 
"mean_token_accuracy": 0.7061830759048462, + "num_tokens": 442182025.0, + "step": 17102 + }, + { + "epoch": 1.8782121678014496, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.426924228668213, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7238156795501709, + "num_tokens": 442205503.0, + "step": 17103 + }, + { + "epoch": 1.8783219855040634, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4915506839752197, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7234320640563965, + "num_tokens": 442228249.0, + "step": 17104 + }, + { + "epoch": 1.878431803206677, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.63889479637146, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7085736989974976, + "num_tokens": 442249551.0, + "step": 17105 + }, + { + "epoch": 1.8785416209092904, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5535178184509277, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7230559587478638, + "num_tokens": 442271194.0, + "step": 17106 + }, + { + "epoch": 1.8786514386119042, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2519893646240234, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7140804529190063, + "num_tokens": 442300042.0, + "step": 17107 + }, + { + "epoch": 1.878761256314518, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.341783285140991, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.69713294506073, + "num_tokens": 442328394.0, + "step": 17108 + }, + { + "epoch": 1.8788710740171317, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.0812227725982666, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6913843154907227, + "num_tokens": 442360783.0, + "step": 17109 + }, + { + "epoch": 1.8789808917197452, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4800233840942383, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 
0.709062397480011, + "num_tokens": 442385450.0, + "step": 17110 + }, + { + "epoch": 1.8790907094223588, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3981149196624756, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7039231657981873, + "num_tokens": 442410895.0, + "step": 17111 + }, + { + "epoch": 1.8792005271249725, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4722583293914795, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7286977171897888, + "num_tokens": 442433978.0, + "step": 17112 + }, + { + "epoch": 1.8793103448275863, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.408677339553833, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7069063782691956, + "num_tokens": 442459487.0, + "step": 17113 + }, + { + "epoch": 1.8794201625301998, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3271305561065674, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7071601152420044, + "num_tokens": 442488748.0, + "step": 17114 + }, + { + "epoch": 1.8795299802328134, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3262739181518555, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7303739786148071, + "num_tokens": 442513277.0, + "step": 17115 + }, + { + "epoch": 1.8796397979354271, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4363560676574707, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7356551885604858, + "num_tokens": 442535248.0, + "step": 17116 + }, + { + "epoch": 1.8797496156380409, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1977109909057617, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7087773084640503, + "num_tokens": 442565238.0, + "step": 17117 + }, + { + "epoch": 1.8798594333406546, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6002800464630127, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7232669591903687, + 
"num_tokens": 442592048.0, + "step": 17118 + }, + { + "epoch": 1.8799692510432682, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.387697696685791, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.697153627872467, + "num_tokens": 442619496.0, + "step": 17119 + }, + { + "epoch": 1.8800790687458817, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.353208303451538, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7052056789398193, + "num_tokens": 442644476.0, + "step": 17120 + }, + { + "epoch": 1.8801888864484955, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.395900249481201, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6974704265594482, + "num_tokens": 442670967.0, + "step": 17121 + }, + { + "epoch": 1.8802987041511092, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.4927234649658203, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7103557586669922, + "num_tokens": 442694055.0, + "step": 17122 + }, + { + "epoch": 1.880408521853723, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3877482414245605, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6888445615768433, + "num_tokens": 442719608.0, + "step": 17123 + }, + { + "epoch": 1.8805183395563365, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.257249593734741, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7091619372367859, + "num_tokens": 442745522.0, + "step": 17124 + }, + { + "epoch": 1.88062815725895, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1177010536193848, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.735525369644165, + "num_tokens": 442774483.0, + "step": 17125 + }, + { + "epoch": 1.8807379749615638, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.415602207183838, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7130874991416931, + "num_tokens": 442799053.0, + 
"step": 17126 + }, + { + "epoch": 1.8808477926641776, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.182990550994873, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6926263570785522, + "num_tokens": 442830422.0, + "step": 17127 + }, + { + "epoch": 1.880957610366791, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.288480043411255, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6972476840019226, + "num_tokens": 442858114.0, + "step": 17128 + }, + { + "epoch": 1.8810674280694046, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3364644050598145, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6883243322372437, + "num_tokens": 442885656.0, + "step": 17129 + }, + { + "epoch": 1.8811772457720184, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.072918176651001, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.727041482925415, + "num_tokens": 442915149.0, + "step": 17130 + }, + { + "epoch": 1.8812870634746321, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.395425319671631, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.710379421710968, + "num_tokens": 442940307.0, + "step": 17131 + }, + { + "epoch": 1.881396881177246, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.192582130432129, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.705409586429596, + "num_tokens": 442971327.0, + "step": 17132 + }, + { + "epoch": 1.8815066988798594, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6224756240844727, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7139558792114258, + "num_tokens": 442993456.0, + "step": 17133 + }, + { + "epoch": 1.881616516582473, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.2610580921173096, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7011970281600952, + "num_tokens": 443022173.0, + "step": 17134 + }, + { + "epoch": 
1.8817263342850867, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3057007789611816, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7233424186706543, + "num_tokens": 443048031.0, + "step": 17135 + }, + { + "epoch": 1.8818361519877005, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 2.6957788467407227, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7113306522369385, + "num_tokens": 443068998.0, + "step": 17136 + }, + { + "epoch": 1.881945969690314, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6112537384033203, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6969950199127197, + "num_tokens": 443090374.0, + "step": 17137 + }, + { + "epoch": 1.8820557873929278, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.34621262550354, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7218514680862427, + "num_tokens": 443116477.0, + "step": 17138 + }, + { + "epoch": 1.8821656050955413, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.463101863861084, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7348027229309082, + "num_tokens": 443142580.0, + "step": 17139 + }, + { + "epoch": 1.882275422798155, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6371095180511475, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7083152532577515, + "num_tokens": 443165982.0, + "step": 17140 + }, + { + "epoch": 1.8823852405007688, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1603329181671143, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7200794219970703, + "num_tokens": 443197490.0, + "step": 17141 + }, + { + "epoch": 1.8824950582033824, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.780776023864746, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7402179837226868, + "num_tokens": 443216578.0, + "step": 17142 + }, + { + "epoch": 1.882604875905996, + 
"ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1838254928588867, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7250710725784302, + "num_tokens": 443246303.0, + "step": 17143 + }, + { + "epoch": 1.8827146936086097, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.381348133087158, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7059604525566101, + "num_tokens": 443272511.0, + "step": 17144 + }, + { + "epoch": 1.8828245113112234, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.333083391189575, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7033416032791138, + "num_tokens": 443297898.0, + "step": 17145 + }, + { + "epoch": 1.8829343290138372, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5490829944610596, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7174674868583679, + "num_tokens": 443320756.0, + "step": 17146 + }, + { + "epoch": 1.8830441467164507, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.513206958770752, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7051506042480469, + "num_tokens": 443344590.0, + "step": 17147 + }, + { + "epoch": 1.8831539644190642, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3908162117004395, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6984106302261353, + "num_tokens": 443372794.0, + "step": 17148 + }, + { + "epoch": 1.883263782121678, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3108065128326416, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6957937479019165, + "num_tokens": 443401270.0, + "step": 17149 + }, + { + "epoch": 1.8833735998242918, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.1779212951660156, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.7045261859893799, + "num_tokens": 443432763.0, + "step": 17150 + }, + { + "epoch": 1.8834834175269053, + "ewc_loss": 
1.9311904907226562e-05, + "grad_norm": 2.1906237602233887, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6957541704177856, + "num_tokens": 443462146.0, + "step": 17151 + }, + { + "epoch": 1.883593235229519, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.298051357269287, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7296547889709473, + "num_tokens": 443489265.0, + "step": 17152 + }, + { + "epoch": 1.8837030529321326, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2306785583496094, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7023963332176208, + "num_tokens": 443521043.0, + "step": 17153 + }, + { + "epoch": 1.8838128706347463, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2663912773132324, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7120547294616699, + "num_tokens": 443548868.0, + "step": 17154 + }, + { + "epoch": 1.88392268833736, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3721439838409424, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7503798007965088, + "num_tokens": 443572380.0, + "step": 17155 + }, + { + "epoch": 1.8840325060399736, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.397167444229126, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7146958708763123, + "num_tokens": 443595791.0, + "step": 17156 + }, + { + "epoch": 1.8841423237425872, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.256279706954956, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7005257606506348, + "num_tokens": 443623415.0, + "step": 17157 + }, + { + "epoch": 1.884252141445201, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.2372167110443115, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7047456502914429, + "num_tokens": 443653345.0, + "step": 17158 + }, + { + "epoch": 1.8843619591478147, + "ewc_loss": 1.9431114196777344e-05, + 
"grad_norm": 2.2718896865844727, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7216119766235352, + "num_tokens": 443681577.0, + "step": 17159 + }, + { + "epoch": 1.8844717768504284, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.3578178882598877, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7144196629524231, + "num_tokens": 443707466.0, + "step": 17160 + }, + { + "epoch": 1.884581594553042, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.439875602722168, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7076421976089478, + "num_tokens": 443730996.0, + "step": 17161 + }, + { + "epoch": 1.8846914122556555, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.526566505432129, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7015368342399597, + "num_tokens": 443754314.0, + "step": 17162 + }, + { + "epoch": 1.8848012299582693, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.6605277061462402, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7214713096618652, + "num_tokens": 443775221.0, + "step": 17163 + }, + { + "epoch": 1.884911047660883, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.5524539947509766, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6887882351875305, + "num_tokens": 443799455.0, + "step": 17164 + }, + { + "epoch": 1.8850208653634966, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.486846685409546, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6948415040969849, + "num_tokens": 443823852.0, + "step": 17165 + }, + { + "epoch": 1.88513068306611, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.36134934425354, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6891320943832397, + "num_tokens": 443850175.0, + "step": 17166 + }, + { + "epoch": 1.8852405007687238, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5263638496398926, + 
"learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7236998081207275, + "num_tokens": 443874583.0, + "step": 17167 + }, + { + "epoch": 1.8853503184713376, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 2.7123775482177734, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.726443886756897, + "num_tokens": 443894730.0, + "step": 17168 + }, + { + "epoch": 1.8854601361739514, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7421915531158447, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7363510131835938, + "num_tokens": 443914815.0, + "step": 17169 + }, + { + "epoch": 1.885569953876565, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.167886734008789, + "learning_rate": 1e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6861178874969482, + "num_tokens": 443946817.0, + "step": 17170 + }, + { + "epoch": 1.8856797715791784, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.418646812438965, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7034742832183838, + "num_tokens": 443972576.0, + "step": 17171 + }, + { + "epoch": 1.8857895892817922, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3354268074035645, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7203296422958374, + "num_tokens": 443997637.0, + "step": 17172 + }, + { + "epoch": 1.885899406984406, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.8210980892181396, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7147210836410522, + "num_tokens": 444016602.0, + "step": 17173 + }, + { + "epoch": 1.8860092246870197, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3318538665771484, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7182332873344421, + "num_tokens": 444042596.0, + "step": 17174 + }, + { + "epoch": 1.8861190423896332, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.109663963317871, + "learning_rate": 1e-06, + 
"loss": 0.8987, + "mean_token_accuracy": 0.7300491333007812, + "num_tokens": 444071027.0, + "step": 17175 + }, + { + "epoch": 1.8862288600922468, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.371764659881592, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6963006258010864, + "num_tokens": 444097511.0, + "step": 17176 + }, + { + "epoch": 1.8863386777948605, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3640778064727783, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.7026396989822388, + "num_tokens": 444123105.0, + "step": 17177 + }, + { + "epoch": 1.8864484954974743, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2433063983917236, + "learning_rate": 1e-06, + "loss": 1.0778, + "mean_token_accuracy": 0.6889881491661072, + "num_tokens": 444152880.0, + "step": 17178 + }, + { + "epoch": 1.8865583132000878, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.401425361633301, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6989591717720032, + "num_tokens": 444177159.0, + "step": 17179 + }, + { + "epoch": 1.8866681309027014, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.215116262435913, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7198356986045837, + "num_tokens": 444204765.0, + "step": 17180 + }, + { + "epoch": 1.8867779486053151, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.0780253410339355, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6976470947265625, + "num_tokens": 444240019.0, + "step": 17181 + }, + { + "epoch": 1.8868877663079289, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3225648403167725, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7161960601806641, + "num_tokens": 444265462.0, + "step": 17182 + }, + { + "epoch": 1.8869975840105426, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3667571544647217, + "learning_rate": 1e-06, + "loss": 0.901, + 
"mean_token_accuracy": 0.7357966303825378, + "num_tokens": 444290496.0, + "step": 17183 + }, + { + "epoch": 1.8871074017131562, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2571043968200684, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7244831919670105, + "num_tokens": 444316974.0, + "step": 17184 + }, + { + "epoch": 1.8872172194157697, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1723639965057373, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7086405158042908, + "num_tokens": 444347182.0, + "step": 17185 + }, + { + "epoch": 1.8873270371183835, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.48730206489563, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7002001404762268, + "num_tokens": 444373363.0, + "step": 17186 + }, + { + "epoch": 1.8874368548209972, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.715893507003784, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6988707184791565, + "num_tokens": 444395418.0, + "step": 17187 + }, + { + "epoch": 1.887546672523611, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.211047649383545, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7331515550613403, + "num_tokens": 444423375.0, + "step": 17188 + }, + { + "epoch": 1.8876564902262245, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.59335994720459, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7192601561546326, + "num_tokens": 444444920.0, + "step": 17189 + }, + { + "epoch": 1.887766307928838, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3126060962677, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.699176013469696, + "num_tokens": 444474529.0, + "step": 17190 + }, + { + "epoch": 1.8878761256314518, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4350385665893555, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 
0.6986588835716248, + "num_tokens": 444500093.0, + "step": 17191 + }, + { + "epoch": 1.8879859433340656, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4063196182250977, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6977732181549072, + "num_tokens": 444527583.0, + "step": 17192 + }, + { + "epoch": 1.888095761036679, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.377694606781006, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7380779981613159, + "num_tokens": 444554468.0, + "step": 17193 + }, + { + "epoch": 1.8882055787392926, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.623469591140747, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7021660804748535, + "num_tokens": 444576911.0, + "step": 17194 + }, + { + "epoch": 1.8883153964419064, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3295469284057617, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.699355959892273, + "num_tokens": 444606474.0, + "step": 17195 + }, + { + "epoch": 1.8884252141445201, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.30959415435791, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.687494158744812, + "num_tokens": 444634777.0, + "step": 17196 + }, + { + "epoch": 1.888535031847134, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3467395305633545, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7081222534179688, + "num_tokens": 444659687.0, + "step": 17197 + }, + { + "epoch": 1.8886448495497474, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4516165256500244, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7253372669219971, + "num_tokens": 444684014.0, + "step": 17198 + }, + { + "epoch": 1.888754667252361, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6539394855499268, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6947630643844604, + 
"num_tokens": 444706289.0, + "step": 17199 + }, + { + "epoch": 1.8888644849549747, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.616140365600586, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.709837794303894, + "num_tokens": 444727952.0, + "step": 17200 + }, + { + "epoch": 1.8889743026575885, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.217686891555786, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7110757231712341, + "num_tokens": 444757089.0, + "step": 17201 + }, + { + "epoch": 1.889084120360202, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.123657703399658, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6937057375907898, + "num_tokens": 444787948.0, + "step": 17202 + }, + { + "epoch": 1.8891939380628158, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1314055919647217, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6834158301353455, + "num_tokens": 444821121.0, + "step": 17203 + }, + { + "epoch": 1.8893037557654293, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5737128257751465, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7100927233695984, + "num_tokens": 444843954.0, + "step": 17204 + }, + { + "epoch": 1.889413573468043, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6298274993896484, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6951017379760742, + "num_tokens": 444865200.0, + "step": 17205 + }, + { + "epoch": 1.8895233911706568, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.453472137451172, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.715506911277771, + "num_tokens": 444888083.0, + "step": 17206 + }, + { + "epoch": 1.8896332088732704, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.392697334289551, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7308498620986938, + "num_tokens": 444912017.0, + 
"step": 17207 + }, + { + "epoch": 1.8897430265758839, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5915579795837402, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7259806394577026, + "num_tokens": 444934303.0, + "step": 17208 + }, + { + "epoch": 1.8898528442784976, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5923237800598145, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.715715765953064, + "num_tokens": 444956792.0, + "step": 17209 + }, + { + "epoch": 1.8899626619811114, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6153995990753174, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.714835524559021, + "num_tokens": 444978512.0, + "step": 17210 + }, + { + "epoch": 1.8900724796837252, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3208417892456055, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7070437073707581, + "num_tokens": 445007847.0, + "step": 17211 + }, + { + "epoch": 1.8901822973863387, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3414363861083984, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7060990929603577, + "num_tokens": 445034565.0, + "step": 17212 + }, + { + "epoch": 1.8902921150889522, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.160118579864502, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6976690292358398, + "num_tokens": 445066671.0, + "step": 17213 + }, + { + "epoch": 1.890401932791566, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.0760600566864014, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6971654891967773, + "num_tokens": 445100563.0, + "step": 17214 + }, + { + "epoch": 1.8905117504941797, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2618350982666016, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7213158011436462, + "num_tokens": 445129719.0, + "step": 17215 + }, + { + 
"epoch": 1.8906215681967933, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.682600736618042, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7014854550361633, + "num_tokens": 445152273.0, + "step": 17216 + }, + { + "epoch": 1.890731385899407, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4807772636413574, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7211309671401978, + "num_tokens": 445176937.0, + "step": 17217 + }, + { + "epoch": 1.8908412036020206, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.420257091522217, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6895251274108887, + "num_tokens": 445207189.0, + "step": 17218 + }, + { + "epoch": 1.8909510213046343, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.532146692276001, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7104270458221436, + "num_tokens": 445229436.0, + "step": 17219 + }, + { + "epoch": 1.891060839007248, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.435852289199829, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7111831903457642, + "num_tokens": 445257816.0, + "step": 17220 + }, + { + "epoch": 1.8911706567098616, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.412247896194458, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.7044452428817749, + "num_tokens": 445282658.0, + "step": 17221 + }, + { + "epoch": 1.8912804744124752, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.291307210922241, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.7011227607727051, + "num_tokens": 445309648.0, + "step": 17222 + }, + { + "epoch": 1.891390292115089, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.370772123336792, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7111366987228394, + "num_tokens": 445336103.0, + "step": 17223 + }, + { + "epoch": 1.8915001098177027, + 
"ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.204822540283203, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7097691297531128, + "num_tokens": 445366873.0, + "step": 17224 + }, + { + "epoch": 1.8916099275203164, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.402489423751831, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.692305326461792, + "num_tokens": 445394047.0, + "step": 17225 + }, + { + "epoch": 1.89171974522293, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5828912258148193, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.703773021697998, + "num_tokens": 445418118.0, + "step": 17226 + }, + { + "epoch": 1.8918295629255435, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2741663455963135, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7081515789031982, + "num_tokens": 445446647.0, + "step": 17227 + }, + { + "epoch": 1.8919393806281573, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4448204040527344, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7176036834716797, + "num_tokens": 445470464.0, + "step": 17228 + }, + { + "epoch": 1.892049198330771, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.718435764312744, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7283821702003479, + "num_tokens": 445492287.0, + "step": 17229 + }, + { + "epoch": 1.8921590160333845, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.542362689971924, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.7033323049545288, + "num_tokens": 445516898.0, + "step": 17230 + }, + { + "epoch": 1.892268833735998, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5505754947662354, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7023626565933228, + "num_tokens": 445541226.0, + "step": 17231 + }, + { + "epoch": 1.8923786514386118, + "ewc_loss": 
1.9431114196777344e-05, + "grad_norm": 2.505728006362915, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7198793888092041, + "num_tokens": 445564130.0, + "step": 17232 + }, + { + "epoch": 1.8924884691412256, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3426880836486816, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7210132479667664, + "num_tokens": 445589156.0, + "step": 17233 + }, + { + "epoch": 1.8925982868438394, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.454453468322754, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7088419795036316, + "num_tokens": 445613269.0, + "step": 17234 + }, + { + "epoch": 1.8927081045464529, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2310245037078857, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6971009969711304, + "num_tokens": 445642926.0, + "step": 17235 + }, + { + "epoch": 1.8928179222490664, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.538835048675537, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7411171793937683, + "num_tokens": 445663012.0, + "step": 17236 + }, + { + "epoch": 1.8929277399516802, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.307798147201538, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7158075571060181, + "num_tokens": 445690161.0, + "step": 17237 + }, + { + "epoch": 1.893037557654294, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3745462894439697, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6981860995292664, + "num_tokens": 445716987.0, + "step": 17238 + }, + { + "epoch": 1.8931473753569077, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3801028728485107, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7341008186340332, + "num_tokens": 445741219.0, + "step": 17239 + }, + { + "epoch": 1.8932571930595212, + "ewc_loss": 1.9431114196777344e-05, + 
"grad_norm": 2.4435107707977295, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6976750493049622, + "num_tokens": 445766150.0, + "step": 17240 + }, + { + "epoch": 1.8933670107621348, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3669111728668213, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6995826363563538, + "num_tokens": 445793915.0, + "step": 17241 + }, + { + "epoch": 1.8934768284647485, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.310159683227539, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.7006637454032898, + "num_tokens": 445819999.0, + "step": 17242 + }, + { + "epoch": 1.8935866461673623, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2746076583862305, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.7053580284118652, + "num_tokens": 445849330.0, + "step": 17243 + }, + { + "epoch": 1.8936964638699758, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.339926242828369, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6949416399002075, + "num_tokens": 445876150.0, + "step": 17244 + }, + { + "epoch": 1.8938062815725893, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.218627452850342, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.7114452123641968, + "num_tokens": 445906024.0, + "step": 17245 + }, + { + "epoch": 1.893916099275203, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.237250566482544, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.6933099031448364, + "num_tokens": 445934276.0, + "step": 17246 + }, + { + "epoch": 1.8940259169778169, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.357926368713379, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6956062316894531, + "num_tokens": 445961624.0, + "step": 17247 + }, + { + "epoch": 1.8941357346804306, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 
2.2874107360839844, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7149341106414795, + "num_tokens": 445991363.0, + "step": 17248 + }, + { + "epoch": 1.8942455523830442, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.529639482498169, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.7021126747131348, + "num_tokens": 446014534.0, + "step": 17249 + }, + { + "epoch": 1.8943553700856577, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.659499406814575, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7273349761962891, + "num_tokens": 446036606.0, + "step": 17250 + }, + { + "epoch": 1.8944651877882714, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7022643089294434, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7144064903259277, + "num_tokens": 446056241.0, + "step": 17251 + }, + { + "epoch": 1.8945750054908852, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.9483039379119873, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7171431183815002, + "num_tokens": 446073179.0, + "step": 17252 + }, + { + "epoch": 1.894684823193499, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4680423736572266, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6957530975341797, + "num_tokens": 446102627.0, + "step": 17253 + }, + { + "epoch": 1.8947946408961125, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3974359035491943, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7163459062576294, + "num_tokens": 446129432.0, + "step": 17254 + }, + { + "epoch": 1.894904458598726, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4713237285614014, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7221956849098206, + "num_tokens": 446151081.0, + "step": 17255 + }, + { + "epoch": 1.8950142763013398, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5758159160614014, + 
"learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.7045090198516846, + "num_tokens": 446176354.0, + "step": 17256 + }, + { + "epoch": 1.8951240940039535, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.346898317337036, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7097698450088501, + "num_tokens": 446202852.0, + "step": 17257 + }, + { + "epoch": 1.895233911706567, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4614720344543457, + "learning_rate": 1e-06, + "loss": 1.0762, + "mean_token_accuracy": 0.6834677457809448, + "num_tokens": 446229433.0, + "step": 17258 + }, + { + "epoch": 1.8953437294091806, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.549952268600464, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7244458198547363, + "num_tokens": 446250044.0, + "step": 17259 + }, + { + "epoch": 1.8954535471117944, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.519139051437378, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.7008187174797058, + "num_tokens": 446274982.0, + "step": 17260 + }, + { + "epoch": 1.8955633648144081, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6530802249908447, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7178258895874023, + "num_tokens": 446296475.0, + "step": 17261 + }, + { + "epoch": 1.8956731825170219, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.097912073135376, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.7012187838554382, + "num_tokens": 446329801.0, + "step": 17262 + }, + { + "epoch": 1.8957830002196354, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4442195892333984, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6942393779754639, + "num_tokens": 446354878.0, + "step": 17263 + }, + { + "epoch": 1.895892817922249, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.0953023433685303, + "learning_rate": 1e-06, + 
"loss": 1.0012, + "mean_token_accuracy": 0.7101644277572632, + "num_tokens": 446388505.0, + "step": 17264 + }, + { + "epoch": 1.8960026356248627, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.63369083404541, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6994088888168335, + "num_tokens": 446409303.0, + "step": 17265 + }, + { + "epoch": 1.8961124533274765, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.239877700805664, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7044501900672913, + "num_tokens": 446437388.0, + "step": 17266 + }, + { + "epoch": 1.89622227103009, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1805505752563477, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6977536082267761, + "num_tokens": 446467633.0, + "step": 17267 + }, + { + "epoch": 1.8963320887327038, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3922946453094482, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7035965919494629, + "num_tokens": 446495235.0, + "step": 17268 + }, + { + "epoch": 1.8964419064353173, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.242748737335205, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6987079381942749, + "num_tokens": 446526089.0, + "step": 17269 + }, + { + "epoch": 1.896551724137931, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7345073223114014, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7147358655929565, + "num_tokens": 446546020.0, + "step": 17270 + }, + { + "epoch": 1.8966615418405448, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.235257863998413, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7155009508132935, + "num_tokens": 446573748.0, + "step": 17271 + }, + { + "epoch": 1.8967713595431583, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3988876342773438, + "learning_rate": 1e-06, + "loss": 0.9319, + 
"mean_token_accuracy": 0.722611129283905, + "num_tokens": 446600106.0, + "step": 17272 + }, + { + "epoch": 1.8968811772457719, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7543628215789795, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7226115465164185, + "num_tokens": 446621102.0, + "step": 17273 + }, + { + "epoch": 1.8969909949483856, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.410055637359619, + "learning_rate": 1e-06, + "loss": 1.0924, + "mean_token_accuracy": 0.6786648035049438, + "num_tokens": 446647799.0, + "step": 17274 + }, + { + "epoch": 1.8971008126509994, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3790998458862305, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.707821786403656, + "num_tokens": 446673387.0, + "step": 17275 + }, + { + "epoch": 1.8972106303536131, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.458157539367676, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7084862589836121, + "num_tokens": 446698085.0, + "step": 17276 + }, + { + "epoch": 1.8973204480562267, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.387856960296631, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6812840700149536, + "num_tokens": 446724016.0, + "step": 17277 + }, + { + "epoch": 1.8974302657588402, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.524552583694458, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7210906147956848, + "num_tokens": 446749350.0, + "step": 17278 + }, + { + "epoch": 1.897540083461454, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.210984945297241, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7173765897750854, + "num_tokens": 446779779.0, + "step": 17279 + }, + { + "epoch": 1.8976499011640677, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.34615159034729, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 
0.7222148776054382, + "num_tokens": 446805939.0, + "step": 17280 + }, + { + "epoch": 1.8977597188666813, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2705163955688477, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7065000534057617, + "num_tokens": 446837087.0, + "step": 17281 + }, + { + "epoch": 1.897869536569295, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3960580825805664, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7140370607376099, + "num_tokens": 446862282.0, + "step": 17282 + }, + { + "epoch": 1.8979793542719086, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1174211502075195, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7228592038154602, + "num_tokens": 446892472.0, + "step": 17283 + }, + { + "epoch": 1.8980891719745223, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3188576698303223, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.725666880607605, + "num_tokens": 446917216.0, + "step": 17284 + }, + { + "epoch": 1.898198989677136, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3414628505706787, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7139530181884766, + "num_tokens": 446942707.0, + "step": 17285 + }, + { + "epoch": 1.8983088073797496, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3277297019958496, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7087697982788086, + "num_tokens": 446970127.0, + "step": 17286 + }, + { + "epoch": 1.8984186250823631, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.829939365386963, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.702488362789154, + "num_tokens": 446990487.0, + "step": 17287 + }, + { + "epoch": 1.898528442784977, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.390073537826538, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7213379144668579, + 
"num_tokens": 447016971.0, + "step": 17288 + }, + { + "epoch": 1.8986382604875907, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1395390033721924, + "learning_rate": 1e-06, + "loss": 1.1149, + "mean_token_accuracy": 0.6816915273666382, + "num_tokens": 447049599.0, + "step": 17289 + }, + { + "epoch": 1.8987480781902044, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2907426357269287, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.7110121250152588, + "num_tokens": 447077909.0, + "step": 17290 + }, + { + "epoch": 1.898857895892818, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.363111972808838, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7368721961975098, + "num_tokens": 447102460.0, + "step": 17291 + }, + { + "epoch": 1.8989677135954315, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4050347805023193, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.715316653251648, + "num_tokens": 447127190.0, + "step": 17292 + }, + { + "epoch": 1.8990775312980452, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3770360946655273, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.726945698261261, + "num_tokens": 447152851.0, + "step": 17293 + }, + { + "epoch": 1.899187349000659, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3403215408325195, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7021749019622803, + "num_tokens": 447181968.0, + "step": 17294 + }, + { + "epoch": 1.8992971667032725, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5215885639190674, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.719788670539856, + "num_tokens": 447204277.0, + "step": 17295 + }, + { + "epoch": 1.899406984405886, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5239291191101074, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.723546028137207, + "num_tokens": 447226752.0, + 
"step": 17296 + }, + { + "epoch": 1.8995168021084998, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4876232147216797, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7393133640289307, + "num_tokens": 447248056.0, + "step": 17297 + }, + { + "epoch": 1.8996266198111136, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4668257236480713, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7016392946243286, + "num_tokens": 447273500.0, + "step": 17298 + }, + { + "epoch": 1.8997364375137273, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.23871111869812, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.709721565246582, + "num_tokens": 447302646.0, + "step": 17299 + }, + { + "epoch": 1.8998462552163409, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3567452430725098, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.711205780506134, + "num_tokens": 447328778.0, + "step": 17300 + }, + { + "epoch": 1.8999560729189544, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4260220527648926, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7129403948783875, + "num_tokens": 447354432.0, + "step": 17301 + }, + { + "epoch": 1.9000658906215682, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2592501640319824, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7204177379608154, + "num_tokens": 447381128.0, + "step": 17302 + }, + { + "epoch": 1.900175708324182, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.537799596786499, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7073923945426941, + "num_tokens": 447406027.0, + "step": 17303 + }, + { + "epoch": 1.9002855260267957, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2984228134155273, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7416340708732605, + "num_tokens": 447430713.0, + "step": 17304 + }, + { + 
"epoch": 1.9003953437294092, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.540083646774292, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7172679901123047, + "num_tokens": 447455069.0, + "step": 17305 + }, + { + "epoch": 1.9005051614320227, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2820820808410645, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.7044854164123535, + "num_tokens": 447483421.0, + "step": 17306 + }, + { + "epoch": 1.9006149791346365, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1926496028900146, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7002742290496826, + "num_tokens": 447511348.0, + "step": 17307 + }, + { + "epoch": 1.9007247968372503, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.548652410507202, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7257262468338013, + "num_tokens": 447533441.0, + "step": 17308 + }, + { + "epoch": 1.9008346145398638, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3276190757751465, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7125279903411865, + "num_tokens": 447558387.0, + "step": 17309 + }, + { + "epoch": 1.9009444322424773, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.071998119354248, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6964592933654785, + "num_tokens": 447592242.0, + "step": 17310 + }, + { + "epoch": 1.901054249945091, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.519071578979492, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6944789886474609, + "num_tokens": 447615537.0, + "step": 17311 + }, + { + "epoch": 1.9011640676477048, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.116966724395752, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7175249457359314, + "num_tokens": 447645715.0, + "step": 17312 + }, + { + "epoch": 1.9012738853503186, 
+ "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2062432765960693, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7230513095855713, + "num_tokens": 447674361.0, + "step": 17313 + }, + { + "epoch": 1.9013837030529321, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3315696716308594, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.7015490531921387, + "num_tokens": 447700851.0, + "step": 17314 + }, + { + "epoch": 1.9014935207555457, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2485463619232178, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7101062536239624, + "num_tokens": 447729638.0, + "step": 17315 + }, + { + "epoch": 1.9016033384581594, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1399285793304443, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6954096555709839, + "num_tokens": 447760890.0, + "step": 17316 + }, + { + "epoch": 1.9017131561607732, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1891043186187744, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7362821698188782, + "num_tokens": 447785658.0, + "step": 17317 + }, + { + "epoch": 1.9018229738633867, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3647098541259766, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6983159780502319, + "num_tokens": 447811159.0, + "step": 17318 + }, + { + "epoch": 1.9019327915660005, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2389092445373535, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7215163111686707, + "num_tokens": 447838044.0, + "step": 17319 + }, + { + "epoch": 1.902042609268614, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4145195484161377, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7223191261291504, + "num_tokens": 447865087.0, + "step": 17320 + }, + { + "epoch": 1.9021524269712278, + "ewc_loss": 
1.9431114196777344e-05, + "grad_norm": 2.367871046066284, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7040241360664368, + "num_tokens": 447890684.0, + "step": 17321 + }, + { + "epoch": 1.9022622446738415, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3824236392974854, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7161128520965576, + "num_tokens": 447915223.0, + "step": 17322 + }, + { + "epoch": 1.902372062376455, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.251124382019043, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6986119747161865, + "num_tokens": 447943562.0, + "step": 17323 + }, + { + "epoch": 1.9024818800790686, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.470109462738037, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7166588306427002, + "num_tokens": 447967291.0, + "step": 17324 + }, + { + "epoch": 1.9025916977816824, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3538737297058105, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7041078209877014, + "num_tokens": 447993169.0, + "step": 17325 + }, + { + "epoch": 1.9027015154842961, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2073280811309814, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.7016783952713013, + "num_tokens": 448021760.0, + "step": 17326 + }, + { + "epoch": 1.9028113331869099, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1480324268341064, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6878540515899658, + "num_tokens": 448054045.0, + "step": 17327 + }, + { + "epoch": 1.9029211508895234, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1554553508758545, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.713685154914856, + "num_tokens": 448083280.0, + "step": 17328 + }, + { + "epoch": 1.903030968592137, + "ewc_loss": 1.9431114196777344e-05, + 
"grad_norm": 3.130295991897583, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7092963457107544, + "num_tokens": 448110152.0, + "step": 17329 + }, + { + "epoch": 1.9031407862947507, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1871988773345947, + "learning_rate": 1e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6909558773040771, + "num_tokens": 448141465.0, + "step": 17330 + }, + { + "epoch": 1.9032506039973645, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 6.96401834487915, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7281887531280518, + "num_tokens": 448165574.0, + "step": 17331 + }, + { + "epoch": 1.903360421699978, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.294529438018799, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6906695365905762, + "num_tokens": 448193919.0, + "step": 17332 + }, + { + "epoch": 1.9034702394025917, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.326953649520874, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7188886404037476, + "num_tokens": 448219763.0, + "step": 17333 + }, + { + "epoch": 1.9035800571052053, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1604065895080566, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6858800053596497, + "num_tokens": 448251785.0, + "step": 17334 + }, + { + "epoch": 1.903689874807819, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3888373374938965, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7121930122375488, + "num_tokens": 448278044.0, + "step": 17335 + }, + { + "epoch": 1.9037996925104328, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3636205196380615, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7195385098457336, + "num_tokens": 448303866.0, + "step": 17336 + }, + { + "epoch": 1.9039095102130463, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 
2.1946945190429688, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7009572982788086, + "num_tokens": 448334191.0, + "step": 17337 + }, + { + "epoch": 1.9040193279156599, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4261131286621094, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7049546837806702, + "num_tokens": 448361400.0, + "step": 17338 + }, + { + "epoch": 1.9041291456182736, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.700793743133545, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.7019315958023071, + "num_tokens": 448381714.0, + "step": 17339 + }, + { + "epoch": 1.9042389633208874, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.352295398712158, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7046793103218079, + "num_tokens": 448407440.0, + "step": 17340 + }, + { + "epoch": 1.9043487810235011, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.597072124481201, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7143962383270264, + "num_tokens": 448429954.0, + "step": 17341 + }, + { + "epoch": 1.9044585987261147, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.371824026107788, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7284119129180908, + "num_tokens": 448455485.0, + "step": 17342 + }, + { + "epoch": 1.9045684164287282, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.471973180770874, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7334433794021606, + "num_tokens": 448480074.0, + "step": 17343 + }, + { + "epoch": 1.904678234131342, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.273890972137451, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7263492345809937, + "num_tokens": 448507473.0, + "step": 17344 + }, + { + "epoch": 1.9047880518339557, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.593331813812256, + 
"learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7094714641571045, + "num_tokens": 448529085.0, + "step": 17345 + }, + { + "epoch": 1.9048978695365693, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6090404987335205, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7097020149230957, + "num_tokens": 448553224.0, + "step": 17346 + }, + { + "epoch": 1.9050076872391828, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4918975830078125, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6940642595291138, + "num_tokens": 448577920.0, + "step": 17347 + }, + { + "epoch": 1.9051175049417965, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.304178476333618, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7171701192855835, + "num_tokens": 448604254.0, + "step": 17348 + }, + { + "epoch": 1.9052273226444103, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5970346927642822, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7126950025558472, + "num_tokens": 448626666.0, + "step": 17349 + }, + { + "epoch": 1.905337140347024, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1188101768493652, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7005191445350647, + "num_tokens": 448661643.0, + "step": 17350 + }, + { + "epoch": 1.9054469580496376, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 8.598172187805176, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7062478065490723, + "num_tokens": 448688672.0, + "step": 17351 + }, + { + "epoch": 1.9055567757522511, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.9803738594055176, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7166985273361206, + "num_tokens": 448707192.0, + "step": 17352 + }, + { + "epoch": 1.9056665934548649, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.557891845703125, + "learning_rate": 1e-06, + 
"loss": 1.0565, + "mean_token_accuracy": 0.6938109397888184, + "num_tokens": 448733459.0, + "step": 17353 + }, + { + "epoch": 1.9057764111574786, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 3.3989803791046143, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7243267893791199, + "num_tokens": 448758398.0, + "step": 17354 + }, + { + "epoch": 1.9058862288600924, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6345198154449463, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7077779769897461, + "num_tokens": 448779754.0, + "step": 17355 + }, + { + "epoch": 1.905996046562706, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.559502601623535, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6975886821746826, + "num_tokens": 448804077.0, + "step": 17356 + }, + { + "epoch": 1.9061058642653195, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7178866863250732, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7186694145202637, + "num_tokens": 448825247.0, + "step": 17357 + }, + { + "epoch": 1.9062156819679332, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.355102777481079, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7185774445533752, + "num_tokens": 448850877.0, + "step": 17358 + }, + { + "epoch": 1.906325499670547, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.583364963531494, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6993885040283203, + "num_tokens": 448873759.0, + "step": 17359 + }, + { + "epoch": 1.9064353173731605, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.270278215408325, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6936046481132507, + "num_tokens": 448902244.0, + "step": 17360 + }, + { + "epoch": 1.906545135075774, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.009748935699463, + "learning_rate": 1e-06, + "loss": 1.0146, + 
"mean_token_accuracy": 0.7137774229049683, + "num_tokens": 448937166.0, + "step": 17361 + }, + { + "epoch": 1.9066549527783878, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3913960456848145, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7114062309265137, + "num_tokens": 448961982.0, + "step": 17362 + }, + { + "epoch": 1.9067647704810016, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2544898986816406, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6992175579071045, + "num_tokens": 448993537.0, + "step": 17363 + }, + { + "epoch": 1.9068745881836153, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.339651584625244, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7138398885726929, + "num_tokens": 449020416.0, + "step": 17364 + }, + { + "epoch": 1.9069844058862289, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.380141496658325, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7285520434379578, + "num_tokens": 449046315.0, + "step": 17365 + }, + { + "epoch": 1.9070942235888424, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4971704483032227, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6955520510673523, + "num_tokens": 449072172.0, + "step": 17366 + }, + { + "epoch": 1.9072040412914562, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5230798721313477, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7204581499099731, + "num_tokens": 449094238.0, + "step": 17367 + }, + { + "epoch": 1.90731385899407, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.466933250427246, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7077963352203369, + "num_tokens": 449118465.0, + "step": 17368 + }, + { + "epoch": 1.9074236766966837, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.614906072616577, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 
0.6973448991775513, + "num_tokens": 449140523.0, + "step": 17369 + }, + { + "epoch": 1.9075334943992972, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.378981113433838, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7005450129508972, + "num_tokens": 449167458.0, + "step": 17370 + }, + { + "epoch": 1.9076433121019107, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7136425971984863, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7205094695091248, + "num_tokens": 449187811.0, + "step": 17371 + }, + { + "epoch": 1.9077531298045245, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3144760131835938, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7299103140830994, + "num_tokens": 449212999.0, + "step": 17372 + }, + { + "epoch": 1.9078629475071383, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3739867210388184, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6920050978660583, + "num_tokens": 449238024.0, + "step": 17373 + }, + { + "epoch": 1.9079727652097518, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1879940032958984, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7097278833389282, + "num_tokens": 449266862.0, + "step": 17374 + }, + { + "epoch": 1.9080825829123653, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.169569730758667, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7241389155387878, + "num_tokens": 449296136.0, + "step": 17375 + }, + { + "epoch": 1.908192400614979, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4137988090515137, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6957305669784546, + "num_tokens": 449322698.0, + "step": 17376 + }, + { + "epoch": 1.9083022183175928, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2623844146728516, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.690707802772522, + 
"num_tokens": 449351229.0, + "step": 17377 + }, + { + "epoch": 1.9084120360202066, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4946253299713135, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7242387533187866, + "num_tokens": 449374457.0, + "step": 17378 + }, + { + "epoch": 1.9085218537228201, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.291327714920044, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7053882479667664, + "num_tokens": 449401738.0, + "step": 17379 + }, + { + "epoch": 1.9086316714254337, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2571773529052734, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7052749991416931, + "num_tokens": 449428375.0, + "step": 17380 + }, + { + "epoch": 1.9087414891280474, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.239987850189209, + "learning_rate": 1e-06, + "loss": 1.1398, + "mean_token_accuracy": 0.673929750919342, + "num_tokens": 449460367.0, + "step": 17381 + }, + { + "epoch": 1.9088513068306612, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.580361843109131, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7294784784317017, + "num_tokens": 449481939.0, + "step": 17382 + }, + { + "epoch": 1.9089611245332747, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.352814197540283, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7070627808570862, + "num_tokens": 449507779.0, + "step": 17383 + }, + { + "epoch": 1.9090709422358885, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.376382827758789, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7176367044448853, + "num_tokens": 449536049.0, + "step": 17384 + }, + { + "epoch": 1.909180759938502, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.635768175125122, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7286048531532288, + "num_tokens": 449556604.0, + 
"step": 17385 + }, + { + "epoch": 1.9092905776411158, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.240851640701294, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7145203351974487, + "num_tokens": 449583690.0, + "step": 17386 + }, + { + "epoch": 1.9094003953437295, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3636717796325684, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7252616286277771, + "num_tokens": 449609306.0, + "step": 17387 + }, + { + "epoch": 1.909510213046343, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.461768627166748, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7160183191299438, + "num_tokens": 449637102.0, + "step": 17388 + }, + { + "epoch": 1.9096200307489566, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.432816743850708, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7232834696769714, + "num_tokens": 449659902.0, + "step": 17389 + }, + { + "epoch": 1.9097298484515703, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3880910873413086, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7148836255073547, + "num_tokens": 449685570.0, + "step": 17390 + }, + { + "epoch": 1.909839666154184, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3675613403320312, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7172752618789673, + "num_tokens": 449709286.0, + "step": 17391 + }, + { + "epoch": 1.9099494838567979, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3145017623901367, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7053731679916382, + "num_tokens": 449735615.0, + "step": 17392 + }, + { + "epoch": 1.9100593015594114, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.264101266860962, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7025623321533203, + "num_tokens": 449762331.0, + "step": 17393 + }, + { + 
"epoch": 1.910169119262025, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1525371074676514, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.7018890380859375, + "num_tokens": 449795113.0, + "step": 17394 + }, + { + "epoch": 1.9102789369646387, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3806424140930176, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7197136878967285, + "num_tokens": 449821302.0, + "step": 17395 + }, + { + "epoch": 1.9103887546672524, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.334799289703369, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7032434940338135, + "num_tokens": 449847719.0, + "step": 17396 + }, + { + "epoch": 1.910498572369866, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.356563091278076, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6896191835403442, + "num_tokens": 449874132.0, + "step": 17397 + }, + { + "epoch": 1.9106083900724797, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7871205806732178, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7209336757659912, + "num_tokens": 449892049.0, + "step": 17398 + }, + { + "epoch": 1.9107182077750933, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.29227876663208, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7080849409103394, + "num_tokens": 449918827.0, + "step": 17399 + }, + { + "epoch": 1.910828025477707, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7680675983428955, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7031223773956299, + "num_tokens": 449940306.0, + "step": 17400 + }, + { + "epoch": 1.9109378431803208, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.288041353225708, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6966577768325806, + "num_tokens": 449968701.0, + "step": 17401 + }, + { + "epoch": 1.9110476608829343, + 
"ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.489027976989746, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6904100179672241, + "num_tokens": 449994837.0, + "step": 17402 + }, + { + "epoch": 1.9111574785855479, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3578245639801025, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6959060430526733, + "num_tokens": 450021966.0, + "step": 17403 + }, + { + "epoch": 1.9112672962881616, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.269902467727661, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6954232454299927, + "num_tokens": 450050687.0, + "step": 17404 + }, + { + "epoch": 1.9113771139907754, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.329883575439453, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7104775905609131, + "num_tokens": 450076238.0, + "step": 17405 + }, + { + "epoch": 1.9114869316933891, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6005828380584717, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7133398056030273, + "num_tokens": 450096935.0, + "step": 17406 + }, + { + "epoch": 1.9115967493960027, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.518653631210327, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.709364652633667, + "num_tokens": 450120989.0, + "step": 17407 + }, + { + "epoch": 1.9117065670986162, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.391512155532837, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7161942720413208, + "num_tokens": 450148570.0, + "step": 17408 + }, + { + "epoch": 1.91181638480123, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.04016375541687, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7505990266799927, + "num_tokens": 450180517.0, + "step": 17409 + }, + { + "epoch": 1.9119262025038437, + "ewc_loss": 1.9550323486328125e-05, 
+ "grad_norm": 2.562204599380493, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7249172329902649, + "num_tokens": 450204352.0, + "step": 17410 + }, + { + "epoch": 1.9120360202064572, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.710679292678833, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7114992141723633, + "num_tokens": 450223829.0, + "step": 17411 + }, + { + "epoch": 1.9121458379090708, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3893914222717285, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.7038302421569824, + "num_tokens": 450249659.0, + "step": 17412 + }, + { + "epoch": 1.9122556556116845, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.57564115524292, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7322079539299011, + "num_tokens": 450271616.0, + "step": 17413 + }, + { + "epoch": 1.9123654733142983, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 7.033593654632568, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.732209324836731, + "num_tokens": 450293969.0, + "step": 17414 + }, + { + "epoch": 1.912475291016912, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.602224826812744, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.721305251121521, + "num_tokens": 450317278.0, + "step": 17415 + }, + { + "epoch": 1.9125851087195256, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5256195068359375, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6985922455787659, + "num_tokens": 450343184.0, + "step": 17416 + }, + { + "epoch": 1.9126949264221391, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.817702531814575, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7393993139266968, + "num_tokens": 450360937.0, + "step": 17417 + }, + { + "epoch": 1.9128047441247529, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 
2.3761799335479736, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7109425067901611, + "num_tokens": 450384911.0, + "step": 17418 + }, + { + "epoch": 1.9129145618273666, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.9471163749694824, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7366052865982056, + "num_tokens": 450403271.0, + "step": 17419 + }, + { + "epoch": 1.9130243795299804, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4310050010681152, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7323794364929199, + "num_tokens": 450425048.0, + "step": 17420 + }, + { + "epoch": 1.913134197232594, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2936296463012695, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7480099201202393, + "num_tokens": 450449798.0, + "step": 17421 + }, + { + "epoch": 1.9132440149352075, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.690080404281616, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7155148983001709, + "num_tokens": 450471716.0, + "step": 17422 + }, + { + "epoch": 1.9133538326378212, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 3.628291130065918, + "learning_rate": 1e-06, + "loss": 1.1079, + "mean_token_accuracy": 0.684657633304596, + "num_tokens": 450499535.0, + "step": 17423 + }, + { + "epoch": 1.913463650340435, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4090847969055176, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7127057313919067, + "num_tokens": 450523739.0, + "step": 17424 + }, + { + "epoch": 1.9135734680430485, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3119099140167236, + "learning_rate": 1e-06, + "loss": 1.1352, + "mean_token_accuracy": 0.6692250967025757, + "num_tokens": 450556393.0, + "step": 17425 + }, + { + "epoch": 1.913683285745662, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.506359100341797, + 
"learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7181710004806519, + "num_tokens": 450577971.0, + "step": 17426 + }, + { + "epoch": 1.9137931034482758, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.237260580062866, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7005816102027893, + "num_tokens": 450609518.0, + "step": 17427 + }, + { + "epoch": 1.9139029211508896, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7792744636535645, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7304903268814087, + "num_tokens": 450629031.0, + "step": 17428 + }, + { + "epoch": 1.9140127388535033, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2653098106384277, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7154471278190613, + "num_tokens": 450656033.0, + "step": 17429 + }, + { + "epoch": 1.9141225565561168, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.394796371459961, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7018065452575684, + "num_tokens": 450681630.0, + "step": 17430 + }, + { + "epoch": 1.9142323742587304, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.723659038543701, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7409642934799194, + "num_tokens": 450701588.0, + "step": 17431 + }, + { + "epoch": 1.9143421919613441, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.501237392425537, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.721515953540802, + "num_tokens": 450724736.0, + "step": 17432 + }, + { + "epoch": 1.914452009663958, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4846601486206055, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7152001857757568, + "num_tokens": 450748266.0, + "step": 17433 + }, + { + "epoch": 1.9145618273665717, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2025656700134277, + "learning_rate": 1e-06, + 
"loss": 0.9432, + "mean_token_accuracy": 0.7148683071136475, + "num_tokens": 450777544.0, + "step": 17434 + }, + { + "epoch": 1.9146716450691852, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4540934562683105, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7133616209030151, + "num_tokens": 450802881.0, + "step": 17435 + }, + { + "epoch": 1.9147814627717987, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4329495429992676, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6970744132995605, + "num_tokens": 450832711.0, + "step": 17436 + }, + { + "epoch": 1.9148912804744125, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2460923194885254, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7012293338775635, + "num_tokens": 450862156.0, + "step": 17437 + }, + { + "epoch": 1.9150010981770262, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.322355031967163, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6915997266769409, + "num_tokens": 450887857.0, + "step": 17438 + }, + { + "epoch": 1.9151109158796398, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 7.583683013916016, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7075464725494385, + "num_tokens": 450916891.0, + "step": 17439 + }, + { + "epoch": 1.9152207335822533, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2959089279174805, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6931260228157043, + "num_tokens": 450943925.0, + "step": 17440 + }, + { + "epoch": 1.915330551284867, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.528109550476074, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7407562732696533, + "num_tokens": 450964413.0, + "step": 17441 + }, + { + "epoch": 1.9154403689874808, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4311716556549072, + "learning_rate": 1e-06, + "loss": 0.9745, + 
"mean_token_accuracy": 0.7106021642684937, + "num_tokens": 450989912.0, + "step": 17442 + }, + { + "epoch": 1.9155501866900946, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.485705614089966, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7054756879806519, + "num_tokens": 451014695.0, + "step": 17443 + }, + { + "epoch": 1.9156600043927081, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.623422622680664, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.697128176689148, + "num_tokens": 451035290.0, + "step": 17444 + }, + { + "epoch": 1.9157698220953217, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7937302589416504, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7181156873703003, + "num_tokens": 451057017.0, + "step": 17445 + }, + { + "epoch": 1.9158796397979354, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3538625240325928, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7220078110694885, + "num_tokens": 451081191.0, + "step": 17446 + }, + { + "epoch": 1.9159894575005492, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3807997703552246, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7189500331878662, + "num_tokens": 451107356.0, + "step": 17447 + }, + { + "epoch": 1.9160992752031627, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1319446563720703, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7001084089279175, + "num_tokens": 451142762.0, + "step": 17448 + }, + { + "epoch": 1.9162090929057765, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.289292097091675, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7173625230789185, + "num_tokens": 451170603.0, + "step": 17449 + }, + { + "epoch": 1.91631891060839, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.226414442062378, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 
0.6974740028381348, + "num_tokens": 451200087.0, + "step": 17450 + }, + { + "epoch": 1.9164287283110037, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.526700973510742, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6992638111114502, + "num_tokens": 451223252.0, + "step": 17451 + }, + { + "epoch": 1.9165385460136175, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.532409429550171, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.7035788297653198, + "num_tokens": 451245659.0, + "step": 17452 + }, + { + "epoch": 1.916648363716231, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.468550682067871, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7247864007949829, + "num_tokens": 451267037.0, + "step": 17453 + }, + { + "epoch": 1.9167581814188446, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.641753911972046, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7163492441177368, + "num_tokens": 451288055.0, + "step": 17454 + }, + { + "epoch": 1.9168679991214583, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.72601318359375, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.698280930519104, + "num_tokens": 451309635.0, + "step": 17455 + }, + { + "epoch": 1.916977816824072, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.278371810913086, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.711595356464386, + "num_tokens": 451341564.0, + "step": 17456 + }, + { + "epoch": 1.9170876345266858, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.297757148742676, + "learning_rate": 1e-06, + "loss": 1.0989, + "mean_token_accuracy": 0.6855428218841553, + "num_tokens": 451369371.0, + "step": 17457 + }, + { + "epoch": 1.9171974522292994, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.458394765853882, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7188385128974915, + "num_tokens": 
451393862.0, + "step": 17458 + }, + { + "epoch": 1.917307269931913, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.364048719406128, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.7085330486297607, + "num_tokens": 451421284.0, + "step": 17459 + }, + { + "epoch": 1.9174170876345267, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.495481252670288, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6996637582778931, + "num_tokens": 451445744.0, + "step": 17460 + }, + { + "epoch": 1.9175269053371404, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3785600662231445, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7178162932395935, + "num_tokens": 451469195.0, + "step": 17461 + }, + { + "epoch": 1.917636723039754, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.489300489425659, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.711388349533081, + "num_tokens": 451492045.0, + "step": 17462 + }, + { + "epoch": 1.9177465407423677, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.685019016265869, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7112874984741211, + "num_tokens": 451514767.0, + "step": 17463 + }, + { + "epoch": 1.9178563584449813, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5477051734924316, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.714765191078186, + "num_tokens": 451541390.0, + "step": 17464 + }, + { + "epoch": 1.917966176147595, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4473979473114014, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7162713408470154, + "num_tokens": 451565160.0, + "step": 17465 + }, + { + "epoch": 1.9180759938502088, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.538947820663452, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6980481147766113, + "num_tokens": 451587544.0, + "step": 17466 + }, 
+ { + "epoch": 1.9181858115528223, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.8633265495300293, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7356563210487366, + "num_tokens": 451612067.0, + "step": 17467 + }, + { + "epoch": 1.9182956292554358, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.362412691116333, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7300026416778564, + "num_tokens": 451638093.0, + "step": 17468 + }, + { + "epoch": 1.9184054469580496, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.369109630584717, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7125550508499146, + "num_tokens": 451665022.0, + "step": 17469 + }, + { + "epoch": 1.9185152646606634, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3376245498657227, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7004498839378357, + "num_tokens": 451692246.0, + "step": 17470 + }, + { + "epoch": 1.9186250823632771, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.553044080734253, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.735564649105072, + "num_tokens": 451715934.0, + "step": 17471 + }, + { + "epoch": 1.9187349000658906, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.129956007003784, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6834430694580078, + "num_tokens": 451751060.0, + "step": 17472 + }, + { + "epoch": 1.9188447177685042, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.169198751449585, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7040773034095764, + "num_tokens": 451782865.0, + "step": 17473 + }, + { + "epoch": 1.918954535471118, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.218780517578125, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.715789794921875, + "num_tokens": 451812176.0, + "step": 17474 + }, + { + "epoch": 
1.9190643531737317, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.402973175048828, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7321571707725525, + "num_tokens": 451835676.0, + "step": 17475 + }, + { + "epoch": 1.9191741708763452, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2129688262939453, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7065247893333435, + "num_tokens": 451864951.0, + "step": 17476 + }, + { + "epoch": 1.9192839885789588, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4633002281188965, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7113918662071228, + "num_tokens": 451888418.0, + "step": 17477 + }, + { + "epoch": 1.9193938062815725, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2394227981567383, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6932587623596191, + "num_tokens": 451917245.0, + "step": 17478 + }, + { + "epoch": 1.9195036239841863, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.416222333908081, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7408246994018555, + "num_tokens": 451940714.0, + "step": 17479 + }, + { + "epoch": 1.9196134416868, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.25353741645813, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7020115852355957, + "num_tokens": 451971952.0, + "step": 17480 + }, + { + "epoch": 1.9197232593894136, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2683424949645996, + "learning_rate": 1e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.6933950185775757, + "num_tokens": 452002024.0, + "step": 17481 + }, + { + "epoch": 1.919833077092027, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3928728103637695, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7117080688476562, + "num_tokens": 452029407.0, + "step": 17482 + }, + { + "epoch": 1.9199428947946409, + 
"ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2394378185272217, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7057564854621887, + "num_tokens": 452056850.0, + "step": 17483 + }, + { + "epoch": 1.9200527124972546, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.254366397857666, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7063207626342773, + "num_tokens": 452084598.0, + "step": 17484 + }, + { + "epoch": 1.9201625301998684, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.304339647293091, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.720509946346283, + "num_tokens": 452112922.0, + "step": 17485 + }, + { + "epoch": 1.920272347902482, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1482319831848145, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.701530933380127, + "num_tokens": 452142665.0, + "step": 17486 + }, + { + "epoch": 1.9203821656050954, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.447453022003174, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7133951783180237, + "num_tokens": 452165606.0, + "step": 17487 + }, + { + "epoch": 1.9204919833077092, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.641951084136963, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7371245622634888, + "num_tokens": 452184198.0, + "step": 17488 + }, + { + "epoch": 1.920601801010323, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6143407821655273, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7095069885253906, + "num_tokens": 452206770.0, + "step": 17489 + }, + { + "epoch": 1.9207116187129365, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.386868476867676, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.706295371055603, + "num_tokens": 452233802.0, + "step": 17490 + }, + { + "epoch": 1.92082143641555, + "ewc_loss": 1.9550323486328125e-05, + 
"grad_norm": 2.785327672958374, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7137092351913452, + "num_tokens": 452257788.0, + "step": 17491 + }, + { + "epoch": 1.9209312541181638, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3343746662139893, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7046491503715515, + "num_tokens": 452285869.0, + "step": 17492 + }, + { + "epoch": 1.9210410718207775, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 7.068292617797852, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7183756828308105, + "num_tokens": 452317624.0, + "step": 17493 + }, + { + "epoch": 1.9211508895233913, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5525050163269043, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7163805365562439, + "num_tokens": 452341393.0, + "step": 17494 + }, + { + "epoch": 1.9212607072260048, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6988234519958496, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7105479836463928, + "num_tokens": 452366505.0, + "step": 17495 + }, + { + "epoch": 1.9213705249286184, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.673494815826416, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7328996658325195, + "num_tokens": 452387653.0, + "step": 17496 + }, + { + "epoch": 1.9214803426312321, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.483790397644043, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7258848547935486, + "num_tokens": 452410836.0, + "step": 17497 + }, + { + "epoch": 1.9215901603338459, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.129183053970337, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7039515376091003, + "num_tokens": 452441639.0, + "step": 17498 + }, + { + "epoch": 1.9216999780364594, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 
2.5935840606689453, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7068003416061401, + "num_tokens": 452462803.0, + "step": 17499 + }, + { + "epoch": 1.9218097957390732, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.533661365509033, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7241321206092834, + "num_tokens": 452485567.0, + "step": 17500 + }, + { + "epoch": 1.9219196134416867, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3767504692077637, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6960703134536743, + "num_tokens": 452512177.0, + "step": 17501 + }, + { + "epoch": 1.9220294311443005, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3709001541137695, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7174975872039795, + "num_tokens": 452540671.0, + "step": 17502 + }, + { + "epoch": 1.9221392488469142, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.155815362930298, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7113184928894043, + "num_tokens": 452572954.0, + "step": 17503 + }, + { + "epoch": 1.9222490665495278, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.567535638809204, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7071573138237, + "num_tokens": 452595664.0, + "step": 17504 + }, + { + "epoch": 1.9223588842521413, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6536123752593994, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7187621593475342, + "num_tokens": 452616085.0, + "step": 17505 + }, + { + "epoch": 1.922468701954755, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5327937602996826, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6924542784690857, + "num_tokens": 452641245.0, + "step": 17506 + }, + { + "epoch": 1.9225785196573688, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.384864568710327, + 
"learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.701635479927063, + "num_tokens": 452666302.0, + "step": 17507 + }, + { + "epoch": 1.9226883373599826, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4406630992889404, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7151328325271606, + "num_tokens": 452689351.0, + "step": 17508 + }, + { + "epoch": 1.922798155062596, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.789651393890381, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7174244523048401, + "num_tokens": 452709981.0, + "step": 17509 + }, + { + "epoch": 1.9229079727652096, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2210373878479004, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7088661193847656, + "num_tokens": 452739967.0, + "step": 17510 + }, + { + "epoch": 1.9230177904678234, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.473867177963257, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.7114425897598267, + "num_tokens": 452766530.0, + "step": 17511 + }, + { + "epoch": 1.9231276081704372, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.28908634185791, + "learning_rate": 1e-06, + "loss": 1.0897, + "mean_token_accuracy": 0.6872882843017578, + "num_tokens": 452795231.0, + "step": 17512 + }, + { + "epoch": 1.9232374258730507, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.245596170425415, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7075289487838745, + "num_tokens": 452823578.0, + "step": 17513 + }, + { + "epoch": 1.9233472435756644, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.206472396850586, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.7021601796150208, + "num_tokens": 452855213.0, + "step": 17514 + }, + { + "epoch": 1.923457061278278, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.354945659637451, + "learning_rate": 1e-06, + "loss": 
1.131, + "mean_token_accuracy": 0.6821385622024536, + "num_tokens": 452885761.0, + "step": 17515 + }, + { + "epoch": 1.9235668789808917, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.529374122619629, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.725893497467041, + "num_tokens": 452909656.0, + "step": 17516 + }, + { + "epoch": 1.9236766966835055, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0507984161376953, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7140552997589111, + "num_tokens": 452945783.0, + "step": 17517 + }, + { + "epoch": 1.923786514386119, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5908775329589844, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7166681289672852, + "num_tokens": 452967176.0, + "step": 17518 + }, + { + "epoch": 1.9238963320887326, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4593453407287598, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6914033889770508, + "num_tokens": 452991130.0, + "step": 17519 + }, + { + "epoch": 1.9240061497913463, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.239691972732544, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.721930980682373, + "num_tokens": 453017318.0, + "step": 17520 + }, + { + "epoch": 1.92411596749396, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.520824670791626, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6948012113571167, + "num_tokens": 453042097.0, + "step": 17521 + }, + { + "epoch": 1.9242257851965738, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.26669979095459, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7011072039604187, + "num_tokens": 453070639.0, + "step": 17522 + }, + { + "epoch": 1.9243356028991874, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.495260000228882, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 
0.7349770069122314, + "num_tokens": 453092593.0, + "step": 17523 + }, + { + "epoch": 1.924445420601801, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.114546060562134, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7349153757095337, + "num_tokens": 453121695.0, + "step": 17524 + }, + { + "epoch": 1.9245552383044147, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1938512325286865, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6990350484848022, + "num_tokens": 453150979.0, + "step": 17525 + }, + { + "epoch": 1.9246650560070284, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1270856857299805, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6971001029014587, + "num_tokens": 453184250.0, + "step": 17526 + }, + { + "epoch": 1.924774873709642, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3237810134887695, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6801828145980835, + "num_tokens": 453215976.0, + "step": 17527 + }, + { + "epoch": 1.9248846914122557, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.414067506790161, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7429238557815552, + "num_tokens": 453239722.0, + "step": 17528 + }, + { + "epoch": 1.9249945091148692, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2741754055023193, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6933849453926086, + "num_tokens": 453267887.0, + "step": 17529 + }, + { + "epoch": 1.925104326817483, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1996850967407227, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7000935673713684, + "num_tokens": 453297401.0, + "step": 17530 + }, + { + "epoch": 1.9252141445200968, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.108581304550171, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7132059335708618, + 
"num_tokens": 453328276.0, + "step": 17531 + }, + { + "epoch": 1.9253239622227103, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.138627767562866, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6933859586715698, + "num_tokens": 453361354.0, + "step": 17532 + }, + { + "epoch": 1.9254337799253238, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 4.477513313293457, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7142314910888672, + "num_tokens": 453385604.0, + "step": 17533 + }, + { + "epoch": 1.9255435976279376, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3595685958862305, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.72080397605896, + "num_tokens": 453411241.0, + "step": 17534 + }, + { + "epoch": 1.9256534153305513, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5361924171447754, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7402830123901367, + "num_tokens": 453432239.0, + "step": 17535 + }, + { + "epoch": 1.925763233033165, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.18125057220459, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7111562490463257, + "num_tokens": 453461818.0, + "step": 17536 + }, + { + "epoch": 1.9258730507357786, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.729931592941284, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7195207476615906, + "num_tokens": 453482038.0, + "step": 17537 + }, + { + "epoch": 1.9259828684383922, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3142309188842773, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7211044430732727, + "num_tokens": 453508784.0, + "step": 17538 + }, + { + "epoch": 1.926092686141006, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1919426918029785, + "learning_rate": 1e-06, + "loss": 1.1028, + "mean_token_accuracy": 0.6865609288215637, + "num_tokens": 453538687.0, + 
"step": 17539 + }, + { + "epoch": 1.9262025038436197, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4360122680664062, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6921056509017944, + "num_tokens": 453564911.0, + "step": 17540 + }, + { + "epoch": 1.9263123215462332, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1212778091430664, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6889021992683411, + "num_tokens": 453599779.0, + "step": 17541 + }, + { + "epoch": 1.9264221392488468, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.424604654312134, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7016634345054626, + "num_tokens": 453624549.0, + "step": 17542 + }, + { + "epoch": 1.9265319569514605, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4900295734405518, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7110775709152222, + "num_tokens": 453648337.0, + "step": 17543 + }, + { + "epoch": 1.9266417746540743, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2996511459350586, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7426286339759827, + "num_tokens": 453675954.0, + "step": 17544 + }, + { + "epoch": 1.926751592356688, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4063174724578857, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6906464099884033, + "num_tokens": 453702509.0, + "step": 17545 + }, + { + "epoch": 1.9268614100593016, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1919009685516357, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7282367944717407, + "num_tokens": 453730629.0, + "step": 17546 + }, + { + "epoch": 1.926971227761915, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.548536777496338, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7190053462982178, + "num_tokens": 453752699.0, + "step": 17547 + }, + { + 
"epoch": 1.9270810454645289, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3727447986602783, + "learning_rate": 1e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6854619383811951, + "num_tokens": 453779843.0, + "step": 17548 + }, + { + "epoch": 1.9271908631671426, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.325838088989258, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7149418592453003, + "num_tokens": 453805125.0, + "step": 17549 + }, + { + "epoch": 1.9273006808697564, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3018300533294678, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6969889402389526, + "num_tokens": 453834983.0, + "step": 17550 + }, + { + "epoch": 1.92741049857237, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3548672199249268, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7224249243736267, + "num_tokens": 453861241.0, + "step": 17551 + }, + { + "epoch": 1.9275203162749834, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3862695693969727, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.704857349395752, + "num_tokens": 453887612.0, + "step": 17552 + }, + { + "epoch": 1.9276301339775972, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.565685987472534, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7337944507598877, + "num_tokens": 453909501.0, + "step": 17553 + }, + { + "epoch": 1.927739951680211, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6803908348083496, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7230708599090576, + "num_tokens": 453930561.0, + "step": 17554 + }, + { + "epoch": 1.9278497693828245, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2471976280212402, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6844927668571472, + "num_tokens": 453960082.0, + "step": 17555 + }, + { + "epoch": 1.927959587085438, + 
"ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0685925483703613, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7097181081771851, + "num_tokens": 453994622.0, + "step": 17556 + }, + { + "epoch": 1.9280694047880518, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3428051471710205, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7260185480117798, + "num_tokens": 454019594.0, + "step": 17557 + }, + { + "epoch": 1.9281792224906655, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5336973667144775, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7424764037132263, + "num_tokens": 454042410.0, + "step": 17558 + }, + { + "epoch": 1.9282890401932793, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.047027349472046, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7185236215591431, + "num_tokens": 454074885.0, + "step": 17559 + }, + { + "epoch": 1.9283988578958928, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4428906440734863, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6934810876846313, + "num_tokens": 454099407.0, + "step": 17560 + }, + { + "epoch": 1.9285086755985064, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.267951011657715, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7113497257232666, + "num_tokens": 454125813.0, + "step": 17561 + }, + { + "epoch": 1.9286184933011201, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.218703031539917, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7362748384475708, + "num_tokens": 454152954.0, + "step": 17562 + }, + { + "epoch": 1.9287283110037339, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.306561231613159, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7178410291671753, + "num_tokens": 454179351.0, + "step": 17563 + }, + { + "epoch": 1.9288381287063474, + "ewc_loss": 
1.9550323486328125e-05, + "grad_norm": 2.29199481010437, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7070636749267578, + "num_tokens": 454205510.0, + "step": 17564 + }, + { + "epoch": 1.9289479464089612, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4669265747070312, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7245694994926453, + "num_tokens": 454228764.0, + "step": 17565 + }, + { + "epoch": 1.9290577641115747, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.680039644241333, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7220373153686523, + "num_tokens": 454248196.0, + "step": 17566 + }, + { + "epoch": 1.9291675818141885, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.530627965927124, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7265386581420898, + "num_tokens": 454269712.0, + "step": 17567 + }, + { + "epoch": 1.9292773995168022, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.32232403755188, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6897144317626953, + "num_tokens": 454297607.0, + "step": 17568 + }, + { + "epoch": 1.9293872172194158, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.518419027328491, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7113014459609985, + "num_tokens": 454321168.0, + "step": 17569 + }, + { + "epoch": 1.9294970349220293, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1760056018829346, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.708823561668396, + "num_tokens": 454352864.0, + "step": 17570 + }, + { + "epoch": 1.929606852624643, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4139041900634766, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7225996255874634, + "num_tokens": 454377003.0, + "step": 17571 + }, + { + "epoch": 1.9297166703272568, + "ewc_loss": 1.9550323486328125e-05, + 
"grad_norm": 2.4639928340911865, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7000719904899597, + "num_tokens": 454403443.0, + "step": 17572 + }, + { + "epoch": 1.9298264880298706, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.299010753631592, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7159041166305542, + "num_tokens": 454428688.0, + "step": 17573 + }, + { + "epoch": 1.929936305732484, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.875906229019165, + "learning_rate": 1e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7380281090736389, + "num_tokens": 454448632.0, + "step": 17574 + }, + { + "epoch": 1.9300461234350976, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.241609811782837, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7219399213790894, + "num_tokens": 454478079.0, + "step": 17575 + }, + { + "epoch": 1.9301559411377114, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.455604314804077, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.721671462059021, + "num_tokens": 454503560.0, + "step": 17576 + }, + { + "epoch": 1.9302657588403251, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.401845932006836, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6959807276725769, + "num_tokens": 454532357.0, + "step": 17577 + }, + { + "epoch": 1.9303755765429387, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1638712882995605, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.704071581363678, + "num_tokens": 454561608.0, + "step": 17578 + }, + { + "epoch": 1.9304853942455524, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3099935054779053, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.7029150724411011, + "num_tokens": 454590535.0, + "step": 17579 + }, + { + "epoch": 1.930595211948166, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.43827748298645, + 
"learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7298004627227783, + "num_tokens": 454613021.0, + "step": 17580 + }, + { + "epoch": 1.9307050296507797, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3409695625305176, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6932446956634521, + "num_tokens": 454638183.0, + "step": 17581 + }, + { + "epoch": 1.9308148473533935, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.363022804260254, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7160816192626953, + "num_tokens": 454664254.0, + "step": 17582 + }, + { + "epoch": 1.930924665056007, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0170421600341797, + "learning_rate": 1e-06, + "loss": 1.0801, + "mean_token_accuracy": 0.6885505318641663, + "num_tokens": 454702713.0, + "step": 17583 + }, + { + "epoch": 1.9310344827586206, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3056466579437256, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.7048490643501282, + "num_tokens": 454729770.0, + "step": 17584 + }, + { + "epoch": 1.9311443004612343, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.296243906021118, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7132538557052612, + "num_tokens": 454754170.0, + "step": 17585 + }, + { + "epoch": 1.931254118163848, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4101812839508057, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7040128707885742, + "num_tokens": 454779132.0, + "step": 17586 + }, + { + "epoch": 1.9313639358664618, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4575250148773193, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6992548108100891, + "num_tokens": 454803807.0, + "step": 17587 + }, + { + "epoch": 1.9314737535690754, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.425402879714966, + "learning_rate": 1e-06, + 
"loss": 1.0637, + "mean_token_accuracy": 0.6941360831260681, + "num_tokens": 454831878.0, + "step": 17588 + }, + { + "epoch": 1.931583571271689, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.314347505569458, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7062975764274597, + "num_tokens": 454859770.0, + "step": 17589 + }, + { + "epoch": 1.9316933889743026, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.8006516695022583, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6985859870910645, + "num_tokens": 454905089.0, + "step": 17590 + }, + { + "epoch": 1.9318032066769164, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.214207649230957, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7109930515289307, + "num_tokens": 454934153.0, + "step": 17591 + }, + { + "epoch": 1.93191302437953, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.264235496520996, + "learning_rate": 1e-06, + "loss": 1.0755, + "mean_token_accuracy": 0.6778057813644409, + "num_tokens": 454965708.0, + "step": 17592 + }, + { + "epoch": 1.9320228420821435, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3271231651306152, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6918244957923889, + "num_tokens": 454994316.0, + "step": 17593 + }, + { + "epoch": 1.9321326597847572, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.8465383052825928, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7242283821105957, + "num_tokens": 455016511.0, + "step": 17594 + }, + { + "epoch": 1.932242477487371, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.739943742752075, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7432488203048706, + "num_tokens": 455036822.0, + "step": 17595 + }, + { + "epoch": 1.9323522951899847, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3087620735168457, + "learning_rate": 1e-06, + "loss": 1.0063, + 
"mean_token_accuracy": 0.7135448455810547, + "num_tokens": 455065624.0, + "step": 17596 + }, + { + "epoch": 1.9324621128925983, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.436079502105713, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7091418504714966, + "num_tokens": 455092259.0, + "step": 17597 + }, + { + "epoch": 1.9325719305952118, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.603273391723633, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7096480131149292, + "num_tokens": 455114492.0, + "step": 17598 + }, + { + "epoch": 1.9326817482978256, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5707640647888184, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7275899648666382, + "num_tokens": 455135634.0, + "step": 17599 + }, + { + "epoch": 1.9327915660004393, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3038721084594727, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7090613842010498, + "num_tokens": 455167182.0, + "step": 17600 + }, + { + "epoch": 1.932901383703053, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4361634254455566, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7086872458457947, + "num_tokens": 455191465.0, + "step": 17601 + }, + { + "epoch": 1.9330112014056666, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1100316047668457, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7126970291137695, + "num_tokens": 455223966.0, + "step": 17602 + }, + { + "epoch": 1.9331210191082802, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.259453535079956, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7087138891220093, + "num_tokens": 455250006.0, + "step": 17603 + }, + { + "epoch": 1.933230836810894, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5330755710601807, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 
0.7049729824066162, + "num_tokens": 455273183.0, + "step": 17604 + }, + { + "epoch": 1.9333406545135077, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2967073917388916, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6943439245223999, + "num_tokens": 455302408.0, + "step": 17605 + }, + { + "epoch": 1.9334504722161212, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3168463706970215, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7032352685928345, + "num_tokens": 455330213.0, + "step": 17606 + }, + { + "epoch": 1.9335602899187347, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.718761444091797, + "learning_rate": 1e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6886715888977051, + "num_tokens": 455354541.0, + "step": 17607 + }, + { + "epoch": 1.9336701076213485, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4242844581604004, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.7016314268112183, + "num_tokens": 455382122.0, + "step": 17608 + }, + { + "epoch": 1.9337799253239623, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3545217514038086, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7036008238792419, + "num_tokens": 455408109.0, + "step": 17609 + }, + { + "epoch": 1.933889743026576, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6507749557495117, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7207109332084656, + "num_tokens": 455429011.0, + "step": 17610 + }, + { + "epoch": 1.9339995607291895, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.487107992172241, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7139132022857666, + "num_tokens": 455453465.0, + "step": 17611 + }, + { + "epoch": 1.934109378431803, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.161604166030884, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.7047973275184631, + 
"num_tokens": 455484298.0, + "step": 17612 + }, + { + "epoch": 1.9342191961344168, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.372197151184082, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6978954076766968, + "num_tokens": 455510085.0, + "step": 17613 + }, + { + "epoch": 1.9343290138370306, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.407097339630127, + "learning_rate": 1e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.692595362663269, + "num_tokens": 455534673.0, + "step": 17614 + }, + { + "epoch": 1.9344388315396444, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3663723468780518, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7061053514480591, + "num_tokens": 455559873.0, + "step": 17615 + }, + { + "epoch": 1.934548649242258, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.212324380874634, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7075178623199463, + "num_tokens": 455588007.0, + "step": 17616 + }, + { + "epoch": 1.9346584669448714, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3211028575897217, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7276086211204529, + "num_tokens": 455612967.0, + "step": 17617 + }, + { + "epoch": 1.9347682846474852, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.397930145263672, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7058713436126709, + "num_tokens": 455641419.0, + "step": 17618 + }, + { + "epoch": 1.934878102350099, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5755128860473633, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7179129719734192, + "num_tokens": 455663530.0, + "step": 17619 + }, + { + "epoch": 1.9349879200527125, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.272191047668457, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7107603549957275, + "num_tokens": 455690929.0, + 
"step": 17620 + }, + { + "epoch": 1.935097737755326, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.387066125869751, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7138094305992126, + "num_tokens": 455716597.0, + "step": 17621 + }, + { + "epoch": 1.9352075554579398, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3153560161590576, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6954693794250488, + "num_tokens": 455745358.0, + "step": 17622 + }, + { + "epoch": 1.9353173731605535, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.238929271697998, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6941967606544495, + "num_tokens": 455773826.0, + "step": 17623 + }, + { + "epoch": 1.9354271908631673, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.219156265258789, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6932957172393799, + "num_tokens": 455802065.0, + "step": 17624 + }, + { + "epoch": 1.9355370085657808, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1877920627593994, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7082940340042114, + "num_tokens": 455831596.0, + "step": 17625 + }, + { + "epoch": 1.9356468262683943, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2307651042938232, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6892433166503906, + "num_tokens": 455860969.0, + "step": 17626 + }, + { + "epoch": 1.935756643971008, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.562732219696045, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7093560695648193, + "num_tokens": 455884957.0, + "step": 17627 + }, + { + "epoch": 1.9358664616736219, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2356393337249756, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6908075213432312, + "num_tokens": 455912229.0, + "step": 17628 + }, + { + 
"epoch": 1.9359762793762354, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.280566930770874, + "learning_rate": 1e-06, + "loss": 1.073, + "mean_token_accuracy": 0.6922285556793213, + "num_tokens": 455939141.0, + "step": 17629 + }, + { + "epoch": 1.9360860970788492, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.338191270828247, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6963273882865906, + "num_tokens": 455970042.0, + "step": 17630 + }, + { + "epoch": 1.9361959147814627, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5312247276306152, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7277323007583618, + "num_tokens": 455991574.0, + "step": 17631 + }, + { + "epoch": 1.9363057324840764, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5073630809783936, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7160171270370483, + "num_tokens": 456014313.0, + "step": 17632 + }, + { + "epoch": 1.9364155501866902, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4216647148132324, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7276008129119873, + "num_tokens": 456038844.0, + "step": 17633 + }, + { + "epoch": 1.9365253678893037, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.277827024459839, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7002289295196533, + "num_tokens": 456065470.0, + "step": 17634 + }, + { + "epoch": 1.9366351855919173, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2121806144714355, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7185721397399902, + "num_tokens": 456094501.0, + "step": 17635 + }, + { + "epoch": 1.936745003294531, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3327319622039795, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7087641358375549, + "num_tokens": 456121203.0, + "step": 17636 + }, + { + "epoch": 1.9368548209971448, 
+ "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4051873683929443, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6884015202522278, + "num_tokens": 456147979.0, + "step": 17637 + }, + { + "epoch": 1.9369646386997585, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.449620008468628, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7390412092208862, + "num_tokens": 456169219.0, + "step": 17638 + }, + { + "epoch": 1.937074456402372, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.212620973587036, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7000532746315002, + "num_tokens": 456197959.0, + "step": 17639 + }, + { + "epoch": 1.9371842741049856, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2823894023895264, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7086407542228699, + "num_tokens": 456225101.0, + "step": 17640 + }, + { + "epoch": 1.9372940918075994, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.447420835494995, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6951714158058167, + "num_tokens": 456251271.0, + "step": 17641 + }, + { + "epoch": 1.9374039095102131, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3414695262908936, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6932451725006104, + "num_tokens": 456279573.0, + "step": 17642 + }, + { + "epoch": 1.9375137272128267, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4291319847106934, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7120440006256104, + "num_tokens": 456304106.0, + "step": 17643 + }, + { + "epoch": 1.9376235449154404, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6516690254211426, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.703260064125061, + "num_tokens": 456326000.0, + "step": 17644 + }, + { + "epoch": 1.937733362618054, + "ewc_loss": 
1.9431114196777344e-05, + "grad_norm": 2.246107339859009, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.694165825843811, + "num_tokens": 456357308.0, + "step": 17645 + }, + { + "epoch": 1.9378431803206677, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4628870487213135, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.705474853515625, + "num_tokens": 456379764.0, + "step": 17646 + }, + { + "epoch": 1.9379529980232815, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.346092939376831, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7233616709709167, + "num_tokens": 456404692.0, + "step": 17647 + }, + { + "epoch": 1.938062815725895, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.433302640914917, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6978818774223328, + "num_tokens": 456429190.0, + "step": 17648 + }, + { + "epoch": 1.9381726334285085, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1344339847564697, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.710027277469635, + "num_tokens": 456459855.0, + "step": 17649 + }, + { + "epoch": 1.9382824511311223, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.174260139465332, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7059023380279541, + "num_tokens": 456489508.0, + "step": 17650 + }, + { + "epoch": 1.938392268833736, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4888792037963867, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6975550651550293, + "num_tokens": 456514346.0, + "step": 17651 + }, + { + "epoch": 1.9385020865363498, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.079669237136841, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6864126920700073, + "num_tokens": 456547450.0, + "step": 17652 + }, + { + "epoch": 1.9386119042389633, + "ewc_loss": 1.9550323486328125e-05, + 
"grad_norm": 2.785390615463257, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7387114763259888, + "num_tokens": 456565621.0, + "step": 17653 + }, + { + "epoch": 1.9387217219415769, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.752769708633423, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.7014358043670654, + "num_tokens": 456585911.0, + "step": 17654 + }, + { + "epoch": 1.9388315396441906, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.44252872467041, + "learning_rate": 1e-06, + "loss": 1.0864, + "mean_token_accuracy": 0.69377201795578, + "num_tokens": 456614068.0, + "step": 17655 + }, + { + "epoch": 1.9389413573468044, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.214383602142334, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7042878270149231, + "num_tokens": 456646528.0, + "step": 17656 + }, + { + "epoch": 1.939051175049418, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4249815940856934, + "learning_rate": 1e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6936133503913879, + "num_tokens": 456672194.0, + "step": 17657 + }, + { + "epoch": 1.9391609927520315, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.692115545272827, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7091662883758545, + "num_tokens": 456692698.0, + "step": 17658 + }, + { + "epoch": 1.9392708104546452, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4323084354400635, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7075783014297485, + "num_tokens": 456715664.0, + "step": 17659 + }, + { + "epoch": 1.939380628157259, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2943739891052246, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7124555110931396, + "num_tokens": 456741854.0, + "step": 17660 + }, + { + "epoch": 1.9394904458598727, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.218797206878662, + 
"learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7008986473083496, + "num_tokens": 456768364.0, + "step": 17661 + }, + { + "epoch": 1.9396002635624863, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2124366760253906, + "learning_rate": 1e-06, + "loss": 1.1016, + "mean_token_accuracy": 0.6856042146682739, + "num_tokens": 456799532.0, + "step": 17662 + }, + { + "epoch": 1.9397100812650998, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4135043621063232, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7313038110733032, + "num_tokens": 456823868.0, + "step": 17663 + }, + { + "epoch": 1.9398198989677136, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.8344385623931885, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7402288913726807, + "num_tokens": 456843968.0, + "step": 17664 + }, + { + "epoch": 1.9399297166703273, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7579798698425293, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7241713404655457, + "num_tokens": 456862883.0, + "step": 17665 + }, + { + "epoch": 1.940039534372941, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.515573263168335, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7074629068374634, + "num_tokens": 456886445.0, + "step": 17666 + }, + { + "epoch": 1.9401493520755546, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5756616592407227, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7118369936943054, + "num_tokens": 456908085.0, + "step": 17667 + }, + { + "epoch": 1.9402591697781681, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.293555498123169, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7160001993179321, + "num_tokens": 456937381.0, + "step": 17668 + }, + { + "epoch": 1.940368987480782, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4424021244049072, + "learning_rate": 1e-06, + 
"loss": 0.967, + "mean_token_accuracy": 0.7209610939025879, + "num_tokens": 456961903.0, + "step": 17669 + }, + { + "epoch": 1.9404788051833957, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.451578378677368, + "learning_rate": 1e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.685875415802002, + "num_tokens": 456986959.0, + "step": 17670 + }, + { + "epoch": 1.9405886228860092, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6647067070007324, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.704512894153595, + "num_tokens": 457006516.0, + "step": 17671 + }, + { + "epoch": 1.9406984405886227, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.104722023010254, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7207850217819214, + "num_tokens": 457038982.0, + "step": 17672 + }, + { + "epoch": 1.9408082582912365, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0589756965637207, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7205550670623779, + "num_tokens": 457070470.0, + "step": 17673 + }, + { + "epoch": 1.9409180759938502, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5986413955688477, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7288933396339417, + "num_tokens": 457090603.0, + "step": 17674 + }, + { + "epoch": 1.941027893696464, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.429033041000366, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7114992141723633, + "num_tokens": 457116720.0, + "step": 17675 + }, + { + "epoch": 1.9411377113990775, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.501589059829712, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7288074493408203, + "num_tokens": 457138482.0, + "step": 17676 + }, + { + "epoch": 1.941247529101691, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4318766593933105, + "learning_rate": 1e-06, + "loss": 0.9431, + 
"mean_token_accuracy": 0.7188723087310791, + "num_tokens": 457162724.0, + "step": 17677 + }, + { + "epoch": 1.9413573468043048, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4151065349578857, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7240362763404846, + "num_tokens": 457187631.0, + "step": 17678 + }, + { + "epoch": 1.9414671645069186, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.218674898147583, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7078086137771606, + "num_tokens": 457217336.0, + "step": 17679 + }, + { + "epoch": 1.9415769822095323, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.495922088623047, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7272951602935791, + "num_tokens": 457239708.0, + "step": 17680 + }, + { + "epoch": 1.9416867999121459, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2262496948242188, + "learning_rate": 1e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.6859012842178345, + "num_tokens": 457273370.0, + "step": 17681 + }, + { + "epoch": 1.9417966176147594, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.153877019882202, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7116600275039673, + "num_tokens": 457303652.0, + "step": 17682 + }, + { + "epoch": 1.9419064353173732, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5876035690307617, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.710577130317688, + "num_tokens": 457329030.0, + "step": 17683 + }, + { + "epoch": 1.942016253019987, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.958679676055908, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7261090874671936, + "num_tokens": 457346283.0, + "step": 17684 + }, + { + "epoch": 1.9421260707226005, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.862961530685425, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 
0.7175815105438232, + "num_tokens": 457366083.0, + "step": 17685 + }, + { + "epoch": 1.942235888425214, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.193814992904663, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6913220286369324, + "num_tokens": 457397897.0, + "step": 17686 + }, + { + "epoch": 1.9423457061278278, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3727288246154785, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7088403701782227, + "num_tokens": 457422345.0, + "step": 17687 + }, + { + "epoch": 1.9424555238304415, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2947638034820557, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7174760103225708, + "num_tokens": 457448229.0, + "step": 17688 + }, + { + "epoch": 1.9425653415330553, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.180570125579834, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.7061014175415039, + "num_tokens": 457477440.0, + "step": 17689 + }, + { + "epoch": 1.9426751592356688, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.220107078552246, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6962740421295166, + "num_tokens": 457505166.0, + "step": 17690 + }, + { + "epoch": 1.9427849769382823, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1456222534179688, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6927176713943481, + "num_tokens": 457536226.0, + "step": 17691 + }, + { + "epoch": 1.942894794640896, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2465782165527344, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.7073124051094055, + "num_tokens": 457565023.0, + "step": 17692 + }, + { + "epoch": 1.9430046123435099, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.301985502243042, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7164932489395142, + 
"num_tokens": 457592425.0, + "step": 17693 + }, + { + "epoch": 1.9431144300461234, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3824164867401123, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6980468034744263, + "num_tokens": 457621475.0, + "step": 17694 + }, + { + "epoch": 1.9432242477487371, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.337554693222046, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6983453631401062, + "num_tokens": 457652399.0, + "step": 17695 + }, + { + "epoch": 1.9433340654513507, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4459574222564697, + "learning_rate": 1e-06, + "loss": 1.112, + "mean_token_accuracy": 0.6740462183952332, + "num_tokens": 457680403.0, + "step": 17696 + }, + { + "epoch": 1.9434438831539644, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4110703468322754, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.706067681312561, + "num_tokens": 457706330.0, + "step": 17697 + }, + { + "epoch": 1.9435537008565782, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5090863704681396, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6870633363723755, + "num_tokens": 457728851.0, + "step": 17698 + }, + { + "epoch": 1.9436635185591917, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7464044094085693, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.715333104133606, + "num_tokens": 457749633.0, + "step": 17699 + }, + { + "epoch": 1.9437733362618053, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.563410758972168, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7077192068099976, + "num_tokens": 457774304.0, + "step": 17700 + }, + { + "epoch": 1.943883153964419, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2933530807495117, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7346069812774658, + "num_tokens": 457801445.0, + 
"step": 17701 + }, + { + "epoch": 1.9439929716670328, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3953123092651367, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7196202278137207, + "num_tokens": 457825155.0, + "step": 17702 + }, + { + "epoch": 1.9441027893696465, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3003432750701904, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.708118736743927, + "num_tokens": 457852221.0, + "step": 17703 + }, + { + "epoch": 1.94421260707226, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.341434955596924, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.7030550241470337, + "num_tokens": 457879946.0, + "step": 17704 + }, + { + "epoch": 1.9443224247748736, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.162670373916626, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7098507285118103, + "num_tokens": 457910546.0, + "step": 17705 + }, + { + "epoch": 1.9444322424774874, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.536497116088867, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7232623100280762, + "num_tokens": 457933478.0, + "step": 17706 + }, + { + "epoch": 1.9445420601801011, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1714391708374023, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.7055385708808899, + "num_tokens": 457964229.0, + "step": 17707 + }, + { + "epoch": 1.9446518778827147, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.378986358642578, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6992964744567871, + "num_tokens": 457988781.0, + "step": 17708 + }, + { + "epoch": 1.9447616955853284, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.34220814704895, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6963274478912354, + "num_tokens": 458016238.0, + "step": 17709 + }, + { + 
"epoch": 1.944871513287942, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2063610553741455, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.685815155506134, + "num_tokens": 458047596.0, + "step": 17710 + }, + { + "epoch": 1.9449813309905557, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1610660552978516, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6904956698417664, + "num_tokens": 458077736.0, + "step": 17711 + }, + { + "epoch": 1.9450911486931695, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5566275119781494, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7069703340530396, + "num_tokens": 458102156.0, + "step": 17712 + }, + { + "epoch": 1.945200966395783, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.234205722808838, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7162884473800659, + "num_tokens": 458132768.0, + "step": 17713 + }, + { + "epoch": 1.9453107840983965, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.327333450317383, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6961528658866882, + "num_tokens": 458159677.0, + "step": 17714 + }, + { + "epoch": 1.9454206018010103, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.388294219970703, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7094444036483765, + "num_tokens": 458184722.0, + "step": 17715 + }, + { + "epoch": 1.945530419503624, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0663650035858154, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7091813683509827, + "num_tokens": 458217156.0, + "step": 17716 + }, + { + "epoch": 1.9456402372062378, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5758461952209473, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7118342518806458, + "num_tokens": 458238832.0, + "step": 17717 + }, + { + "epoch": 1.9457500549088513, + 
"ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5130436420440674, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7013247013092041, + "num_tokens": 458262370.0, + "step": 17718 + }, + { + "epoch": 1.9458598726114649, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.758641004562378, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7300684452056885, + "num_tokens": 458281145.0, + "step": 17719 + }, + { + "epoch": 1.9459696903140786, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3568313121795654, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7107550501823425, + "num_tokens": 458308876.0, + "step": 17720 + }, + { + "epoch": 1.9460795080166924, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1960811614990234, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6924978494644165, + "num_tokens": 458338453.0, + "step": 17721 + }, + { + "epoch": 1.946189325719306, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.332578420639038, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.698075532913208, + "num_tokens": 458364645.0, + "step": 17722 + }, + { + "epoch": 1.9462991434219195, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.521596908569336, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6986217498779297, + "num_tokens": 458389238.0, + "step": 17723 + }, + { + "epoch": 1.9464089611245332, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4772729873657227, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7066261768341064, + "num_tokens": 458412718.0, + "step": 17724 + }, + { + "epoch": 1.946518778827147, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3368031978607178, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.727220892906189, + "num_tokens": 458440883.0, + "step": 17725 + }, + { + "epoch": 1.9466285965297607, + "ewc_loss": 
1.9550323486328125e-05, + "grad_norm": 2.1464481353759766, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.701888918876648, + "num_tokens": 458471234.0, + "step": 17726 + }, + { + "epoch": 1.9467384142323743, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.728973865509033, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7032462358474731, + "num_tokens": 458491313.0, + "step": 17727 + }, + { + "epoch": 1.9468482319349878, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1572697162628174, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7181813716888428, + "num_tokens": 458521269.0, + "step": 17728 + }, + { + "epoch": 1.9469580496376016, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3528060913085938, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6979085206985474, + "num_tokens": 458549404.0, + "step": 17729 + }, + { + "epoch": 1.9470678673402153, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.371763229370117, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6948962211608887, + "num_tokens": 458576406.0, + "step": 17730 + }, + { + "epoch": 1.947177685042829, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.497136116027832, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7132365107536316, + "num_tokens": 458601780.0, + "step": 17731 + }, + { + "epoch": 1.9472875027454426, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6420187950134277, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7266287207603455, + "num_tokens": 458626965.0, + "step": 17732 + }, + { + "epoch": 1.9473973204480561, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.384655475616455, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.72599196434021, + "num_tokens": 458652567.0, + "step": 17733 + }, + { + "epoch": 1.94750713815067, + "ewc_loss": 1.9550323486328125e-05, + 
"grad_norm": 2.5369672775268555, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7013497352600098, + "num_tokens": 458678063.0, + "step": 17734 + }, + { + "epoch": 1.9476169558532836, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.406127452850342, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7203183174133301, + "num_tokens": 458703002.0, + "step": 17735 + }, + { + "epoch": 1.9477267735558972, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 3.055711269378662, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7188297510147095, + "num_tokens": 458726273.0, + "step": 17736 + }, + { + "epoch": 1.9478365912585107, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4795734882354736, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.711727499961853, + "num_tokens": 458748947.0, + "step": 17737 + }, + { + "epoch": 1.9479464089611245, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.477289915084839, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7081767320632935, + "num_tokens": 458773159.0, + "step": 17738 + }, + { + "epoch": 1.9480562266637382, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2455103397369385, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7018465995788574, + "num_tokens": 458802385.0, + "step": 17739 + }, + { + "epoch": 1.948166044366352, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.376554489135742, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7122775316238403, + "num_tokens": 458828374.0, + "step": 17740 + }, + { + "epoch": 1.9482758620689655, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6496684551239014, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6963823437690735, + "num_tokens": 458852194.0, + "step": 17741 + }, + { + "epoch": 1.948385679771579, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 
2.2270190715789795, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7282382249832153, + "num_tokens": 458879585.0, + "step": 17742 + }, + { + "epoch": 1.9484954974741928, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1674652099609375, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7103089690208435, + "num_tokens": 458910244.0, + "step": 17743 + }, + { + "epoch": 1.9486053151768066, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2729365825653076, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7061439752578735, + "num_tokens": 458937185.0, + "step": 17744 + }, + { + "epoch": 1.94871513287942, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.32735538482666, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7287250757217407, + "num_tokens": 458963727.0, + "step": 17745 + }, + { + "epoch": 1.9488249505820339, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1573359966278076, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.692819356918335, + "num_tokens": 458996251.0, + "step": 17746 + }, + { + "epoch": 1.9489347682846474, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.124661922454834, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7056539058685303, + "num_tokens": 459027137.0, + "step": 17747 + }, + { + "epoch": 1.9490445859872612, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4772064685821533, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7259379625320435, + "num_tokens": 459050086.0, + "step": 17748 + }, + { + "epoch": 1.949154403689875, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.435999870300293, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7247520089149475, + "num_tokens": 459073889.0, + "step": 17749 + }, + { + "epoch": 1.9492642213924884, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3876960277557373, + 
"learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.7036212682723999, + "num_tokens": 459100458.0, + "step": 17750 + }, + { + "epoch": 1.949374039095102, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6996374130249023, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7265095114707947, + "num_tokens": 459119890.0, + "step": 17751 + }, + { + "epoch": 1.9494838567977157, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.406362533569336, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.697698712348938, + "num_tokens": 459147450.0, + "step": 17752 + }, + { + "epoch": 1.9495936745003295, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2978522777557373, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.737945020198822, + "num_tokens": 459172636.0, + "step": 17753 + }, + { + "epoch": 1.9497034922029433, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4333012104034424, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.69772869348526, + "num_tokens": 459200885.0, + "step": 17754 + }, + { + "epoch": 1.9498133099055568, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.212078094482422, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7205033302307129, + "num_tokens": 459229408.0, + "step": 17755 + }, + { + "epoch": 1.9499231276081703, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5157132148742676, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7298030257225037, + "num_tokens": 459254265.0, + "step": 17756 + }, + { + "epoch": 1.950032945310784, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.449326515197754, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6998240947723389, + "num_tokens": 459280332.0, + "step": 17757 + }, + { + "epoch": 1.9501427630133978, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3476755619049072, + "learning_rate": 1e-06, + "loss": 
0.9523, + "mean_token_accuracy": 0.7153638601303101, + "num_tokens": 459305595.0, + "step": 17758 + }, + { + "epoch": 1.9502525807160114, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7013139724731445, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7342687845230103, + "num_tokens": 459325299.0, + "step": 17759 + }, + { + "epoch": 1.9503623984186251, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.387054681777954, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7070486545562744, + "num_tokens": 459352836.0, + "step": 17760 + }, + { + "epoch": 1.9504722161212387, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4063920974731445, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6954861283302307, + "num_tokens": 459379227.0, + "step": 17761 + }, + { + "epoch": 1.9505820338238524, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2386624813079834, + "learning_rate": 1e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.6897965669631958, + "num_tokens": 459411692.0, + "step": 17762 + }, + { + "epoch": 1.9506918515264662, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.02396821975708, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7173323035240173, + "num_tokens": 459445915.0, + "step": 17763 + }, + { + "epoch": 1.9508016692290797, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.461188793182373, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6931411027908325, + "num_tokens": 459471396.0, + "step": 17764 + }, + { + "epoch": 1.9509114869316933, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.650502920150757, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7121312022209167, + "num_tokens": 459493047.0, + "step": 17765 + }, + { + "epoch": 1.951021304634307, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6695284843444824, + "learning_rate": 1e-06, + "loss": 0.9601, + 
"mean_token_accuracy": 0.716667652130127, + "num_tokens": 459515684.0, + "step": 17766 + }, + { + "epoch": 1.9511311223369208, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.406670093536377, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7169969081878662, + "num_tokens": 459540503.0, + "step": 17767 + }, + { + "epoch": 1.9512409400395345, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2413597106933594, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7060784101486206, + "num_tokens": 459569352.0, + "step": 17768 + }, + { + "epoch": 1.951350757742148, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6492910385131836, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7347657680511475, + "num_tokens": 459591727.0, + "step": 17769 + }, + { + "epoch": 1.9514605754447616, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6263999938964844, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.725278377532959, + "num_tokens": 459613008.0, + "step": 17770 + }, + { + "epoch": 1.9515703931473753, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3157079219818115, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7163743376731873, + "num_tokens": 459640519.0, + "step": 17771 + }, + { + "epoch": 1.951680210849989, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.309328079223633, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7071802020072937, + "num_tokens": 459668946.0, + "step": 17772 + }, + { + "epoch": 1.9517900285526026, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0437939167022705, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.717602014541626, + "num_tokens": 459701833.0, + "step": 17773 + }, + { + "epoch": 1.9518998462552162, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2734482288360596, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 
0.7211862206459045, + "num_tokens": 459729092.0, + "step": 17774 + }, + { + "epoch": 1.95200966395783, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4471590518951416, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6968077421188354, + "num_tokens": 459755210.0, + "step": 17775 + }, + { + "epoch": 1.9521194816604437, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1524641513824463, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6872004866600037, + "num_tokens": 459787978.0, + "step": 17776 + }, + { + "epoch": 1.9522292993630574, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.364166259765625, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7217036485671997, + "num_tokens": 459814198.0, + "step": 17777 + }, + { + "epoch": 1.952339117065671, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1269161701202393, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7381123304367065, + "num_tokens": 459843631.0, + "step": 17778 + }, + { + "epoch": 1.9524489347682845, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5454506874084473, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7192323207855225, + "num_tokens": 459865903.0, + "step": 17779 + }, + { + "epoch": 1.9525587524708983, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5978808403015137, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6912103891372681, + "num_tokens": 459890911.0, + "step": 17780 + }, + { + "epoch": 1.952668570173512, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.296194076538086, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7238832712173462, + "num_tokens": 459916634.0, + "step": 17781 + }, + { + "epoch": 1.9527783878761258, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.218677282333374, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6978691220283508, + 
"num_tokens": 459945596.0, + "step": 17782 + }, + { + "epoch": 1.9528882055787393, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3222813606262207, + "learning_rate": 1e-06, + "loss": 1.0957, + "mean_token_accuracy": 0.6771304607391357, + "num_tokens": 459971509.0, + "step": 17783 + }, + { + "epoch": 1.9529980232813529, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4558675289154053, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7058107256889343, + "num_tokens": 459995866.0, + "step": 17784 + }, + { + "epoch": 1.9531078409839666, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5272216796875, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7102099657058716, + "num_tokens": 460019422.0, + "step": 17785 + }, + { + "epoch": 1.9532176586865804, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2832770347595215, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.7001057863235474, + "num_tokens": 460051120.0, + "step": 17786 + }, + { + "epoch": 1.953327476389194, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3956313133239746, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7189082503318787, + "num_tokens": 460075998.0, + "step": 17787 + }, + { + "epoch": 1.9534372940918074, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.377127170562744, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7297857999801636, + "num_tokens": 460102115.0, + "step": 17788 + }, + { + "epoch": 1.9535471117944212, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.688795566558838, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.707341194152832, + "num_tokens": 460124918.0, + "step": 17789 + }, + { + "epoch": 1.953656929497035, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3876736164093018, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7028645277023315, + "num_tokens": 460151773.0, + 
"step": 17790 + }, + { + "epoch": 1.9537667471996487, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2659051418304443, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.704349160194397, + "num_tokens": 460180962.0, + "step": 17791 + }, + { + "epoch": 1.9538765649022622, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5525825023651123, + "learning_rate": 1e-06, + "loss": 1.0893, + "mean_token_accuracy": 0.6865111589431763, + "num_tokens": 460204462.0, + "step": 17792 + }, + { + "epoch": 1.9539863826048758, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.397892713546753, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6944549083709717, + "num_tokens": 460231154.0, + "step": 17793 + }, + { + "epoch": 1.9540962003074895, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.751039981842041, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7260887622833252, + "num_tokens": 460251325.0, + "step": 17794 + }, + { + "epoch": 1.9542060180101033, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.475656509399414, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7024037837982178, + "num_tokens": 460275225.0, + "step": 17795 + }, + { + "epoch": 1.954315835712717, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4573423862457275, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7216907739639282, + "num_tokens": 460300141.0, + "step": 17796 + }, + { + "epoch": 1.9544256534153306, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.645453929901123, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7208890318870544, + "num_tokens": 460321850.0, + "step": 17797 + }, + { + "epoch": 1.9545354711179441, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0545029640197754, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.728190541267395, + "num_tokens": 460353093.0, + "step": 17798 + }, + { + 
"epoch": 1.9546452888205579, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3802781105041504, + "learning_rate": 1e-06, + "loss": 1.1336, + "mean_token_accuracy": 0.6783604025840759, + "num_tokens": 460380598.0, + "step": 17799 + }, + { + "epoch": 1.9547551065231716, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3668417930603027, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7035056352615356, + "num_tokens": 460405826.0, + "step": 17800 + }, + { + "epoch": 1.9548649242257852, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.50185489654541, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7129725217819214, + "num_tokens": 460429248.0, + "step": 17801 + }, + { + "epoch": 1.9549747419283987, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3981668949127197, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7299710512161255, + "num_tokens": 460453280.0, + "step": 17802 + }, + { + "epoch": 1.9550845596310125, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4745569229125977, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7152867317199707, + "num_tokens": 460476265.0, + "step": 17803 + }, + { + "epoch": 1.9551943773336262, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.13177227973938, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7083806395530701, + "num_tokens": 460506821.0, + "step": 17804 + }, + { + "epoch": 1.95530419503624, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.250270128250122, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6941736936569214, + "num_tokens": 460535198.0, + "step": 17805 + }, + { + "epoch": 1.9554140127388535, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4841971397399902, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6991500854492188, + "num_tokens": 460559697.0, + "step": 17806 + }, + { + "epoch": 1.955523830441467, + 
"ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3118185997009277, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.7007338404655457, + "num_tokens": 460587873.0, + "step": 17807 + }, + { + "epoch": 1.9556336481440808, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3450729846954346, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6896184086799622, + "num_tokens": 460613492.0, + "step": 17808 + }, + { + "epoch": 1.9557434658466946, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5795934200286865, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7291760444641113, + "num_tokens": 460633636.0, + "step": 17809 + }, + { + "epoch": 1.955853283549308, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2648725509643555, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7226166129112244, + "num_tokens": 460660312.0, + "step": 17810 + }, + { + "epoch": 1.9559631012519219, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.8472204208374023, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7126840353012085, + "num_tokens": 460678619.0, + "step": 17811 + }, + { + "epoch": 1.9560729189545354, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5974555015563965, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7241871356964111, + "num_tokens": 460699511.0, + "step": 17812 + }, + { + "epoch": 1.9561827366571491, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2910168170928955, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7136788368225098, + "num_tokens": 460726563.0, + "step": 17813 + }, + { + "epoch": 1.956292554359763, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1656060218811035, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7278510332107544, + "num_tokens": 460755421.0, + "step": 17814 + }, + { + "epoch": 1.9564023720623764, + "ewc_loss": 
1.9550323486328125e-05, + "grad_norm": 2.308842420578003, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7121662497520447, + "num_tokens": 460781504.0, + "step": 17815 + }, + { + "epoch": 1.95651218976499, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2821500301361084, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7159658670425415, + "num_tokens": 460808805.0, + "step": 17816 + }, + { + "epoch": 1.9566220074676037, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2578365802764893, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.7024056911468506, + "num_tokens": 460836943.0, + "step": 17817 + }, + { + "epoch": 1.9567318251702175, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3790369033813477, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6998335123062134, + "num_tokens": 460863149.0, + "step": 17818 + }, + { + "epoch": 1.9568416428728312, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2829644680023193, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7218411564826965, + "num_tokens": 460891102.0, + "step": 17819 + }, + { + "epoch": 1.9569514605754448, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.27530837059021, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6975476145744324, + "num_tokens": 460917830.0, + "step": 17820 + }, + { + "epoch": 1.9570612782780583, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.253328800201416, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.7026294469833374, + "num_tokens": 460943706.0, + "step": 17821 + }, + { + "epoch": 1.957171095980672, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.20725679397583, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7244987487792969, + "num_tokens": 460970926.0, + "step": 17822 + }, + { + "epoch": 1.9572809136832858, + "ewc_loss": 1.9550323486328125e-05, + 
"grad_norm": 2.3152825832366943, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7198930978775024, + "num_tokens": 460995967.0, + "step": 17823 + }, + { + "epoch": 1.9573907313858994, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1849844455718994, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.7042778134346008, + "num_tokens": 461026643.0, + "step": 17824 + }, + { + "epoch": 1.9575005490885131, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.400113344192505, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6986373662948608, + "num_tokens": 461052393.0, + "step": 17825 + }, + { + "epoch": 1.9576103667911267, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2159321308135986, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.7028409242630005, + "num_tokens": 461083688.0, + "step": 17826 + }, + { + "epoch": 1.9577201844937404, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4718852043151855, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7089783549308777, + "num_tokens": 461106965.0, + "step": 17827 + }, + { + "epoch": 1.9578300021963542, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3102352619171143, + "learning_rate": 1e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.6814941167831421, + "num_tokens": 461134676.0, + "step": 17828 + }, + { + "epoch": 1.9579398198989677, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3381330966949463, + "learning_rate": 1e-06, + "loss": 1.077, + "mean_token_accuracy": 0.69000244140625, + "num_tokens": 461163256.0, + "step": 17829 + }, + { + "epoch": 1.9580496376015812, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5640318393707275, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7293980121612549, + "num_tokens": 461186258.0, + "step": 17830 + }, + { + "epoch": 1.958159455304195, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 
2.158294439315796, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6964981555938721, + "num_tokens": 461218074.0, + "step": 17831 + }, + { + "epoch": 1.9582692730068088, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2757463455200195, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7156745791435242, + "num_tokens": 461243234.0, + "step": 17832 + }, + { + "epoch": 1.9583790907094225, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2153573036193848, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.704230546951294, + "num_tokens": 461272954.0, + "step": 17833 + }, + { + "epoch": 1.958488908412036, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4223954677581787, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7318371534347534, + "num_tokens": 461295956.0, + "step": 17834 + }, + { + "epoch": 1.9585987261146496, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.32667875289917, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7092865705490112, + "num_tokens": 461323378.0, + "step": 17835 + }, + { + "epoch": 1.9587085438172633, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.278933048248291, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7022274732589722, + "num_tokens": 461349861.0, + "step": 17836 + }, + { + "epoch": 1.958818361519877, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.691248893737793, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7220790386199951, + "num_tokens": 461370899.0, + "step": 17837 + }, + { + "epoch": 1.9589281792224906, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.417593479156494, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7262922525405884, + "num_tokens": 461397994.0, + "step": 17838 + }, + { + "epoch": 1.9590379969251042, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5612051486968994, + 
"learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7126616835594177, + "num_tokens": 461421247.0, + "step": 17839 + }, + { + "epoch": 1.959147814627718, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5317492485046387, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7047203779220581, + "num_tokens": 461443922.0, + "step": 17840 + }, + { + "epoch": 1.9592576323303317, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5714540481567383, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7160605788230896, + "num_tokens": 461466502.0, + "step": 17841 + }, + { + "epoch": 1.9593674500329454, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.13348388671875, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6907439231872559, + "num_tokens": 461502352.0, + "step": 17842 + }, + { + "epoch": 1.959477267735559, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4171385765075684, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7171269655227661, + "num_tokens": 461526845.0, + "step": 17843 + }, + { + "epoch": 1.9595870854381725, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2705326080322266, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7287324666976929, + "num_tokens": 461552144.0, + "step": 17844 + }, + { + "epoch": 1.9596969031407863, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1862809658050537, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7139874696731567, + "num_tokens": 461581473.0, + "step": 17845 + }, + { + "epoch": 1.9598067208434, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2874224185943604, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7219350934028625, + "num_tokens": 461607343.0, + "step": 17846 + }, + { + "epoch": 1.9599165385460138, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2564444541931152, + "learning_rate": 1e-06, + 
"loss": 0.9852, + "mean_token_accuracy": 0.7055437564849854, + "num_tokens": 461634676.0, + "step": 17847 + }, + { + "epoch": 1.9600263562486273, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1405677795410156, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6981734037399292, + "num_tokens": 461667066.0, + "step": 17848 + }, + { + "epoch": 1.9601361739512408, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.603341817855835, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7597143650054932, + "num_tokens": 461687263.0, + "step": 17849 + }, + { + "epoch": 1.9602459916538546, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.281684637069702, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7292290329933167, + "num_tokens": 461712768.0, + "step": 17850 + }, + { + "epoch": 1.9603558093564684, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3602237701416016, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6925100088119507, + "num_tokens": 461738675.0, + "step": 17851 + }, + { + "epoch": 1.960465627059082, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3445985317230225, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7096307873725891, + "num_tokens": 461764686.0, + "step": 17852 + }, + { + "epoch": 1.9605754447616954, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.205787181854248, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7100441455841064, + "num_tokens": 461792000.0, + "step": 17853 + }, + { + "epoch": 1.9606852624643092, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.617121934890747, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7069961428642273, + "num_tokens": 461815714.0, + "step": 17854 + }, + { + "epoch": 1.960795080166923, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4097416400909424, + "learning_rate": 1e-06, + "loss": 0.9928, + 
"mean_token_accuracy": 0.7075389623641968, + "num_tokens": 461840276.0, + "step": 17855 + }, + { + "epoch": 1.9609048978695367, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.208162784576416, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6863102316856384, + "num_tokens": 461871940.0, + "step": 17856 + }, + { + "epoch": 1.9610147155721502, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2825493812561035, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7273486256599426, + "num_tokens": 461900224.0, + "step": 17857 + }, + { + "epoch": 1.9611245332747638, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3560376167297363, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7120873332023621, + "num_tokens": 461925930.0, + "step": 17858 + }, + { + "epoch": 1.9612343509773775, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.386159896850586, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7135403752326965, + "num_tokens": 461951480.0, + "step": 17859 + }, + { + "epoch": 1.9613441686799913, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.401277542114258, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7089887261390686, + "num_tokens": 461975796.0, + "step": 17860 + }, + { + "epoch": 1.961453986382605, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.677203893661499, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7091094255447388, + "num_tokens": 461996344.0, + "step": 17861 + }, + { + "epoch": 1.9615638040852186, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.310019016265869, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.724047064781189, + "num_tokens": 462023553.0, + "step": 17862 + }, + { + "epoch": 1.9616736217878321, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2648251056671143, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 
0.6962549090385437, + "num_tokens": 462049915.0, + "step": 17863 + }, + { + "epoch": 1.9617834394904459, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.204606056213379, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.695540726184845, + "num_tokens": 462079966.0, + "step": 17864 + }, + { + "epoch": 1.9618932571930596, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5773274898529053, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.7080804109573364, + "num_tokens": 462103348.0, + "step": 17865 + }, + { + "epoch": 1.9620030748956732, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4291276931762695, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7095806002616882, + "num_tokens": 462131788.0, + "step": 17866 + }, + { + "epoch": 1.9621128925982867, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.677525281906128, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7095569372177124, + "num_tokens": 462154304.0, + "step": 17867 + }, + { + "epoch": 1.9622227103009005, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.39595627784729, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7248724699020386, + "num_tokens": 462180987.0, + "step": 17868 + }, + { + "epoch": 1.9623325280035142, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.169219970703125, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7107267379760742, + "num_tokens": 462210465.0, + "step": 17869 + }, + { + "epoch": 1.962442345706128, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3259904384613037, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.7064685821533203, + "num_tokens": 462236827.0, + "step": 17870 + }, + { + "epoch": 1.9625521634087415, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.353624105453491, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7138375043869019, + 
"num_tokens": 462264662.0, + "step": 17871 + }, + { + "epoch": 1.962661981111355, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5149855613708496, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7126691341400146, + "num_tokens": 462288633.0, + "step": 17872 + }, + { + "epoch": 1.9627717988139688, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.351823568344116, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6996322870254517, + "num_tokens": 462315207.0, + "step": 17873 + }, + { + "epoch": 1.9628816165165826, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.373197555541992, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7006530165672302, + "num_tokens": 462343361.0, + "step": 17874 + }, + { + "epoch": 1.962991434219196, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5989019870758057, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7042293548583984, + "num_tokens": 462365597.0, + "step": 17875 + }, + { + "epoch": 1.9631012519218098, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.372042655944824, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6934360861778259, + "num_tokens": 462391846.0, + "step": 17876 + }, + { + "epoch": 1.9632110696244234, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3284213542938232, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7101926803588867, + "num_tokens": 462418944.0, + "step": 17877 + }, + { + "epoch": 1.9633208873270371, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3666136264801025, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7029589414596558, + "num_tokens": 462446632.0, + "step": 17878 + }, + { + "epoch": 1.963430705029651, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4306955337524414, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7247992157936096, + "num_tokens": 462471488.0, + 
"step": 17879 + }, + { + "epoch": 1.9635405227322644, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.275425434112549, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6857735514640808, + "num_tokens": 462499777.0, + "step": 17880 + }, + { + "epoch": 1.963650340434878, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.58341646194458, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7081751823425293, + "num_tokens": 462521390.0, + "step": 17881 + }, + { + "epoch": 1.9637601581374917, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.429442882537842, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7156490087509155, + "num_tokens": 462548402.0, + "step": 17882 + }, + { + "epoch": 1.9638699758401055, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.260279417037964, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7123028039932251, + "num_tokens": 462579022.0, + "step": 17883 + }, + { + "epoch": 1.9639797935427192, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.609163761138916, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.72606360912323, + "num_tokens": 462600350.0, + "step": 17884 + }, + { + "epoch": 1.9640896112453328, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.188394069671631, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7064775228500366, + "num_tokens": 462628836.0, + "step": 17885 + }, + { + "epoch": 1.9641994289479463, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7025792598724365, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7198398113250732, + "num_tokens": 462648404.0, + "step": 17886 + }, + { + "epoch": 1.96430924665056, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.213954210281372, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7212225794792175, + "num_tokens": 462675662.0, + "step": 17887 + }, + { + "epoch": 
1.9644190643531738, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3434627056121826, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7024913430213928, + "num_tokens": 462702373.0, + "step": 17888 + }, + { + "epoch": 1.9645288820557874, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5803864002227783, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7171958684921265, + "num_tokens": 462724181.0, + "step": 17889 + }, + { + "epoch": 1.964638699758401, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.336580753326416, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7013185024261475, + "num_tokens": 462751807.0, + "step": 17890 + }, + { + "epoch": 1.9647485174610146, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1981513500213623, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7080140709877014, + "num_tokens": 462783331.0, + "step": 17891 + }, + { + "epoch": 1.9648583351636284, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2457399368286133, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7231519222259521, + "num_tokens": 462812266.0, + "step": 17892 + }, + { + "epoch": 1.9649681528662422, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2622437477111816, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7152807712554932, + "num_tokens": 462841811.0, + "step": 17893 + }, + { + "epoch": 1.9650779705688557, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.323972463607788, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7115235328674316, + "num_tokens": 462870788.0, + "step": 17894 + }, + { + "epoch": 1.9651877882714692, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.302961826324463, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6881994009017944, + "num_tokens": 462899120.0, + "step": 17895 + }, + { + "epoch": 1.965297605974083, + 
"ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.173708200454712, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7211887836456299, + "num_tokens": 462928270.0, + "step": 17896 + }, + { + "epoch": 1.9654074236766967, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0654077529907227, + "learning_rate": 1e-06, + "loss": 1.0906, + "mean_token_accuracy": 0.6934272050857544, + "num_tokens": 462960055.0, + "step": 17897 + }, + { + "epoch": 1.9655172413793105, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.299440383911133, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.711686909198761, + "num_tokens": 462989831.0, + "step": 17898 + }, + { + "epoch": 1.965627059081924, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6887149810791016, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6912994384765625, + "num_tokens": 463011219.0, + "step": 17899 + }, + { + "epoch": 1.9657368767845376, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.001012086868286, + "learning_rate": 1e-06, + "loss": 1.0913, + "mean_token_accuracy": 0.6859763860702515, + "num_tokens": 463047470.0, + "step": 17900 + }, + { + "epoch": 1.9658466944871513, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7690060138702393, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7142904996871948, + "num_tokens": 463068746.0, + "step": 17901 + }, + { + "epoch": 1.965956512189765, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.408062219619751, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7186237573623657, + "num_tokens": 463095132.0, + "step": 17902 + }, + { + "epoch": 1.9660663298923786, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.0786142349243164, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7244851589202881, + "num_tokens": 463124970.0, + "step": 17903 + }, + { + "epoch": 1.9661761475949922, + "ewc_loss": 
1.9431114196777344e-05, + "grad_norm": 2.3178834915161133, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7071018815040588, + "num_tokens": 463154144.0, + "step": 17904 + }, + { + "epoch": 1.966285965297606, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6045777797698975, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7058511972427368, + "num_tokens": 463179158.0, + "step": 17905 + }, + { + "epoch": 1.9663957830002197, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.260504961013794, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7414861917495728, + "num_tokens": 463206314.0, + "step": 17906 + }, + { + "epoch": 1.9665056007028334, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7196500301361084, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7090312838554382, + "num_tokens": 463228266.0, + "step": 17907 + }, + { + "epoch": 1.966615418405447, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3094258308410645, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.696256697177887, + "num_tokens": 463255744.0, + "step": 17908 + }, + { + "epoch": 1.9667252361080605, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.187467575073242, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.7006279230117798, + "num_tokens": 463287159.0, + "step": 17909 + }, + { + "epoch": 1.9668350538106742, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.239851951599121, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7391717433929443, + "num_tokens": 463313671.0, + "step": 17910 + }, + { + "epoch": 1.966944871513288, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.455177068710327, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.7057779431343079, + "num_tokens": 463338766.0, + "step": 17911 + }, + { + "epoch": 1.9670546892159018, + "ewc_loss": 1.9431114196777344e-05, + 
"grad_norm": 2.333174705505371, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7153993844985962, + "num_tokens": 463365966.0, + "step": 17912 + }, + { + "epoch": 1.9671645069185153, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4054174423217773, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7105118036270142, + "num_tokens": 463389649.0, + "step": 17913 + }, + { + "epoch": 1.9672743246211288, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2438228130340576, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7061041593551636, + "num_tokens": 463418898.0, + "step": 17914 + }, + { + "epoch": 1.9673841423237426, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5120606422424316, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7031644582748413, + "num_tokens": 463442296.0, + "step": 17915 + }, + { + "epoch": 1.9674939600263563, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.179487705230713, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7054042220115662, + "num_tokens": 463470572.0, + "step": 17916 + }, + { + "epoch": 1.9676037777289699, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.264875888824463, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7039555907249451, + "num_tokens": 463501542.0, + "step": 17917 + }, + { + "epoch": 1.9677135954315834, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4364001750946045, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6923022866249084, + "num_tokens": 463528562.0, + "step": 17918 + }, + { + "epoch": 1.9678234131341972, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.333266258239746, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7131037712097168, + "num_tokens": 463557046.0, + "step": 17919 + }, + { + "epoch": 1.967933230836811, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 
2.5100090503692627, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7137150764465332, + "num_tokens": 463581800.0, + "step": 17920 + }, + { + "epoch": 1.9680430485394247, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.381967782974243, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7142843008041382, + "num_tokens": 463607472.0, + "step": 17921 + }, + { + "epoch": 1.9681528662420382, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.243777275085449, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6894270181655884, + "num_tokens": 463635685.0, + "step": 17922 + }, + { + "epoch": 1.9682626839446518, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2414586544036865, + "learning_rate": 1e-06, + "loss": 1.1108, + "mean_token_accuracy": 0.6862192749977112, + "num_tokens": 463667783.0, + "step": 17923 + }, + { + "epoch": 1.9683725016472655, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1915035247802734, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.7088501453399658, + "num_tokens": 463699725.0, + "step": 17924 + }, + { + "epoch": 1.9684823193498793, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4724745750427246, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7268729209899902, + "num_tokens": 463721407.0, + "step": 17925 + }, + { + "epoch": 1.9685921370524928, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.364494800567627, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.7002700567245483, + "num_tokens": 463749097.0, + "step": 17926 + }, + { + "epoch": 1.9687019547551066, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1805431842803955, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7148216962814331, + "num_tokens": 463777892.0, + "step": 17927 + }, + { + "epoch": 1.96881177245772, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3550195693969727, + 
"learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.714429497718811, + "num_tokens": 463802494.0, + "step": 17928 + }, + { + "epoch": 1.9689215901603339, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3495430946350098, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7025489211082458, + "num_tokens": 463829345.0, + "step": 17929 + }, + { + "epoch": 1.9690314078629476, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.517376661300659, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7031850814819336, + "num_tokens": 463852750.0, + "step": 17930 + }, + { + "epoch": 1.9691412255655611, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.383296251296997, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7268093228340149, + "num_tokens": 463876341.0, + "step": 17931 + }, + { + "epoch": 1.9692510432681747, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.329160213470459, + "learning_rate": 1e-06, + "loss": 1.061, + "mean_token_accuracy": 0.6857050657272339, + "num_tokens": 463903977.0, + "step": 17932 + }, + { + "epoch": 1.9693608609707884, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1430184841156006, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6970614790916443, + "num_tokens": 463935305.0, + "step": 17933 + }, + { + "epoch": 1.9694706786734022, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.260749578475952, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7141367197036743, + "num_tokens": 463964995.0, + "step": 17934 + }, + { + "epoch": 1.969580496376016, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.22379207611084, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7057321071624756, + "num_tokens": 463995158.0, + "step": 17935 + }, + { + "epoch": 1.9696903140786295, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2796707153320312, + "learning_rate": 1e-06, + 
"loss": 1.0019, + "mean_token_accuracy": 0.708091676235199, + "num_tokens": 464022617.0, + "step": 17936 + }, + { + "epoch": 1.969800131781243, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5699305534362793, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7191532850265503, + "num_tokens": 464044568.0, + "step": 17937 + }, + { + "epoch": 1.9699099494838568, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5179824829101562, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7227424383163452, + "num_tokens": 464066711.0, + "step": 17938 + }, + { + "epoch": 1.9700197671864705, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1766817569732666, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.7032288908958435, + "num_tokens": 464096915.0, + "step": 17939 + }, + { + "epoch": 1.970129584889084, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.366873264312744, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7076053023338318, + "num_tokens": 464123541.0, + "step": 17940 + }, + { + "epoch": 1.9702394025916978, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.255255937576294, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7029455304145813, + "num_tokens": 464151300.0, + "step": 17941 + }, + { + "epoch": 1.9703492202943114, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2572877407073975, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7020571231842041, + "num_tokens": 464178356.0, + "step": 17942 + }, + { + "epoch": 1.9704590379969251, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.356457233428955, + "learning_rate": 1e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6840507984161377, + "num_tokens": 464206079.0, + "step": 17943 + }, + { + "epoch": 1.9705688556995389, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1101114749908447, + "learning_rate": 1e-06, + "loss": 1.0841, + 
"mean_token_accuracy": 0.6807073354721069, + "num_tokens": 464238899.0, + "step": 17944 + }, + { + "epoch": 1.9706786734021524, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.210820436477661, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.7000386118888855, + "num_tokens": 464269439.0, + "step": 17945 + }, + { + "epoch": 1.970788491104766, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.471254348754883, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7181187272071838, + "num_tokens": 464292768.0, + "step": 17946 + }, + { + "epoch": 1.9708983088073797, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2100322246551514, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7085682153701782, + "num_tokens": 464322796.0, + "step": 17947 + }, + { + "epoch": 1.9710081265099935, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1325137615203857, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7081338167190552, + "num_tokens": 464353679.0, + "step": 17948 + }, + { + "epoch": 1.9711179442126072, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.8376102447509766, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7187244296073914, + "num_tokens": 464372732.0, + "step": 17949 + }, + { + "epoch": 1.9712277619152208, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.018383741378784, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7235109210014343, + "num_tokens": 464405373.0, + "step": 17950 + }, + { + "epoch": 1.9713375796178343, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4261667728424072, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7018072009086609, + "num_tokens": 464430443.0, + "step": 17951 + }, + { + "epoch": 1.971447397320448, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2434942722320557, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 
0.6919913291931152, + "num_tokens": 464457515.0, + "step": 17952 + }, + { + "epoch": 1.9715572150230618, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.846872329711914, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.7053256034851074, + "num_tokens": 464477028.0, + "step": 17953 + }, + { + "epoch": 1.9716670327256753, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3784520626068115, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.706561803817749, + "num_tokens": 464502140.0, + "step": 17954 + }, + { + "epoch": 1.9717768504282889, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2797329425811768, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7168947458267212, + "num_tokens": 464530992.0, + "step": 17955 + }, + { + "epoch": 1.9718866681309026, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1613311767578125, + "learning_rate": 1e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.6805612444877625, + "num_tokens": 464565271.0, + "step": 17956 + }, + { + "epoch": 1.9719964858335164, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2480475902557373, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7375670671463013, + "num_tokens": 464592186.0, + "step": 17957 + }, + { + "epoch": 1.9721063035361301, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.312448501586914, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7187280654907227, + "num_tokens": 464616573.0, + "step": 17958 + }, + { + "epoch": 1.9722161212387437, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.419466495513916, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6931437253952026, + "num_tokens": 464641020.0, + "step": 17959 + }, + { + "epoch": 1.9723259389413572, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1854186058044434, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7274136543273926, + 
"num_tokens": 464671457.0, + "step": 17960 + }, + { + "epoch": 1.972435756643971, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.198484420776367, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7252202033996582, + "num_tokens": 464701479.0, + "step": 17961 + }, + { + "epoch": 1.9725455743465847, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2360122203826904, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6991016864776611, + "num_tokens": 464730943.0, + "step": 17962 + }, + { + "epoch": 1.9726553920491985, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6277716159820557, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.7010606527328491, + "num_tokens": 464753566.0, + "step": 17963 + }, + { + "epoch": 1.972765209751812, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3719215393066406, + "learning_rate": 1e-06, + "loss": 1.0743, + "mean_token_accuracy": 0.6891328692436218, + "num_tokens": 464781410.0, + "step": 17964 + }, + { + "epoch": 1.9728750274544256, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5292134284973145, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7159813642501831, + "num_tokens": 464806484.0, + "step": 17965 + }, + { + "epoch": 1.9729848451570393, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3886606693267822, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7050355672836304, + "num_tokens": 464833348.0, + "step": 17966 + }, + { + "epoch": 1.973094662859653, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.8108670711517334, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.69739830493927, + "num_tokens": 464856986.0, + "step": 17967 + }, + { + "epoch": 1.9732044805622666, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.097350597381592, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7359155416488647, + "num_tokens": 464887986.0, + 
"step": 17968 + }, + { + "epoch": 1.9733142982648801, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7576990127563477, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7235257625579834, + "num_tokens": 464907957.0, + "step": 17969 + }, + { + "epoch": 1.973424115967494, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2441649436950684, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.727773904800415, + "num_tokens": 464933222.0, + "step": 17970 + }, + { + "epoch": 1.9735339336701077, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.391313076019287, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7057870626449585, + "num_tokens": 464959829.0, + "step": 17971 + }, + { + "epoch": 1.9736437513727214, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.416375160217285, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7163255214691162, + "num_tokens": 464983039.0, + "step": 17972 + }, + { + "epoch": 1.973753569075335, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2827000617980957, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6914721727371216, + "num_tokens": 465012388.0, + "step": 17973 + }, + { + "epoch": 1.9738633867779485, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.222532272338867, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6967077255249023, + "num_tokens": 465042863.0, + "step": 17974 + }, + { + "epoch": 1.9739732044805622, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.361920118331909, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.719091534614563, + "num_tokens": 465066140.0, + "step": 17975 + }, + { + "epoch": 1.974083022183176, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3084781169891357, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7205384373664856, + "num_tokens": 465091795.0, + "step": 17976 + }, + { + 
"epoch": 1.9741928398857898, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.512526750564575, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7244628667831421, + "num_tokens": 465115843.0, + "step": 17977 + }, + { + "epoch": 1.9743026575884033, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1992907524108887, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7084605693817139, + "num_tokens": 465143631.0, + "step": 17978 + }, + { + "epoch": 1.9744124752910168, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3549506664276123, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6872757077217102, + "num_tokens": 465170006.0, + "step": 17979 + }, + { + "epoch": 1.9745222929936306, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.289950370788574, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.711179256439209, + "num_tokens": 465196759.0, + "step": 17980 + }, + { + "epoch": 1.9746321106962443, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2591545581817627, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7389896512031555, + "num_tokens": 465223186.0, + "step": 17981 + }, + { + "epoch": 1.9747419283988579, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4979801177978516, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7082372903823853, + "num_tokens": 465244970.0, + "step": 17982 + }, + { + "epoch": 1.9748517461014714, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4385979175567627, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.717607855796814, + "num_tokens": 465267666.0, + "step": 17983 + }, + { + "epoch": 1.9749615638040852, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.132939577102661, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6860750317573547, + "num_tokens": 465300502.0, + "step": 17984 + }, + { + "epoch": 1.975071381506699, 
+ "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.331137180328369, + "learning_rate": 1e-06, + "loss": 1.0969, + "mean_token_accuracy": 0.6801162958145142, + "num_tokens": 465330538.0, + "step": 17985 + }, + { + "epoch": 1.9751811992093127, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3364524841308594, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7116883993148804, + "num_tokens": 465356579.0, + "step": 17986 + }, + { + "epoch": 1.9752910169119262, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.743168354034424, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7245160341262817, + "num_tokens": 465375455.0, + "step": 17987 + }, + { + "epoch": 1.9754008346145397, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1781647205352783, + "learning_rate": 1e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.6777125597000122, + "num_tokens": 465406089.0, + "step": 17988 + }, + { + "epoch": 1.9755106523171535, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5951342582702637, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7306280136108398, + "num_tokens": 465428073.0, + "step": 17989 + }, + { + "epoch": 1.9756204700197673, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.320655345916748, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7055627703666687, + "num_tokens": 465454586.0, + "step": 17990 + }, + { + "epoch": 1.9757302877223808, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.294374942779541, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7098480463027954, + "num_tokens": 465479020.0, + "step": 17991 + }, + { + "epoch": 1.9758401054249946, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.620838165283203, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7192238569259644, + "num_tokens": 465499620.0, + "step": 17992 + }, + { + "epoch": 1.975949923127608, + "ewc_loss": 
1.9550323486328125e-05, + "grad_norm": 2.5157692432403564, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7207369804382324, + "num_tokens": 465523489.0, + "step": 17993 + }, + { + "epoch": 1.9760597408302218, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.485724449157715, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7180346250534058, + "num_tokens": 465550394.0, + "step": 17994 + }, + { + "epoch": 1.9761695585328356, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.454308032989502, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7261430025100708, + "num_tokens": 465574074.0, + "step": 17995 + }, + { + "epoch": 1.9762793762354491, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.9468451738357544, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7285306453704834, + "num_tokens": 465610552.0, + "step": 17996 + }, + { + "epoch": 1.9763891939380627, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.300349473953247, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6964539289474487, + "num_tokens": 465638687.0, + "step": 17997 + }, + { + "epoch": 1.9764990116406764, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.7659056186676025, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7116831541061401, + "num_tokens": 465659469.0, + "step": 17998 + }, + { + "epoch": 1.9766088293432902, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.107553005218506, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7077615857124329, + "num_tokens": 465690690.0, + "step": 17999 + }, + { + "epoch": 1.976718647045904, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.564408540725708, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7233988642692566, + "num_tokens": 465713521.0, + "step": 18000 + }, + { + "epoch": 1.9768284647485175, + "ewc_loss": 1.9431114196777344e-05, + 
"grad_norm": 2.2370307445526123, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6990237236022949, + "num_tokens": 465741454.0, + "step": 18001 + }, + { + "epoch": 1.976938282451131, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.348158359527588, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7103215456008911, + "num_tokens": 465767905.0, + "step": 18002 + }, + { + "epoch": 1.9770481001537448, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2643463611602783, + "learning_rate": 1e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.6824460029602051, + "num_tokens": 465795347.0, + "step": 18003 + }, + { + "epoch": 1.9771579178563585, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.9502339363098145, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7197954058647156, + "num_tokens": 465812978.0, + "step": 18004 + }, + { + "epoch": 1.977267735558972, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.349062204360962, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7032473087310791, + "num_tokens": 465838833.0, + "step": 18005 + }, + { + "epoch": 1.9773775532615858, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.368497371673584, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7282212972640991, + "num_tokens": 465865017.0, + "step": 18006 + }, + { + "epoch": 1.9774873709641994, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5175836086273193, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7006909847259521, + "num_tokens": 465887636.0, + "step": 18007 + }, + { + "epoch": 1.977597188666813, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 32.5795783996582, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6917843222618103, + "num_tokens": 465917893.0, + "step": 18008 + }, + { + "epoch": 1.9777070063694269, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2103543281555176, 
+ "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7177022099494934, + "num_tokens": 465946956.0, + "step": 18009 + }, + { + "epoch": 1.9778168240720404, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4939823150634766, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7164162993431091, + "num_tokens": 465971921.0, + "step": 18010 + }, + { + "epoch": 1.977926641774654, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3491947650909424, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7085916996002197, + "num_tokens": 465998156.0, + "step": 18011 + }, + { + "epoch": 1.9780364594772677, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.115551471710205, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7053004503250122, + "num_tokens": 466029346.0, + "step": 18012 + }, + { + "epoch": 1.9781462771798815, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.155466079711914, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7194453477859497, + "num_tokens": 466059698.0, + "step": 18013 + }, + { + "epoch": 1.9782560948824952, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.466524362564087, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6918164491653442, + "num_tokens": 466084530.0, + "step": 18014 + }, + { + "epoch": 1.9783659125851087, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.065800666809082, + "learning_rate": 1e-06, + "loss": 1.0812, + "mean_token_accuracy": 0.6845784783363342, + "num_tokens": 466118536.0, + "step": 18015 + }, + { + "epoch": 1.9784757302877223, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4430434703826904, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7200174331665039, + "num_tokens": 466142907.0, + "step": 18016 + }, + { + "epoch": 1.978585547990336, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1343483924865723, + "learning_rate": 1e-06, + 
"loss": 0.9788, + "mean_token_accuracy": 0.7148253917694092, + "num_tokens": 466175850.0, + "step": 18017 + }, + { + "epoch": 1.9786953656929498, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.449685573577881, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6909341812133789, + "num_tokens": 466201036.0, + "step": 18018 + }, + { + "epoch": 1.9788051833955633, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5716354846954346, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.720203161239624, + "num_tokens": 466221700.0, + "step": 18019 + }, + { + "epoch": 1.9789150010981769, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.717947244644165, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7125107049942017, + "num_tokens": 466241325.0, + "step": 18020 + }, + { + "epoch": 1.9790248188007906, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.113886833190918, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7029649019241333, + "num_tokens": 466272510.0, + "step": 18021 + }, + { + "epoch": 1.9791346365034044, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2995967864990234, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7092292308807373, + "num_tokens": 466297912.0, + "step": 18022 + }, + { + "epoch": 1.9792444542060181, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3936750888824463, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7056611180305481, + "num_tokens": 466324169.0, + "step": 18023 + }, + { + "epoch": 1.9793542719086317, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3294477462768555, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.7028112411499023, + "num_tokens": 466351938.0, + "step": 18024 + }, + { + "epoch": 1.9794640896112452, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2684104442596436, + "learning_rate": 1e-06, + "loss": 0.9997, + 
"mean_token_accuracy": 0.7006173133850098, + "num_tokens": 466381266.0, + "step": 18025 + }, + { + "epoch": 1.979573907313859, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2114250659942627, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7215895652770996, + "num_tokens": 466408555.0, + "step": 18026 + }, + { + "epoch": 1.9796837250164727, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.565669059753418, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7091077566146851, + "num_tokens": 466429737.0, + "step": 18027 + }, + { + "epoch": 1.9797935427190865, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.367548704147339, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7126747369766235, + "num_tokens": 466454995.0, + "step": 18028 + }, + { + "epoch": 1.9799033604217, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.237387180328369, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6965670585632324, + "num_tokens": 466485326.0, + "step": 18029 + }, + { + "epoch": 1.9800131781243135, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1033668518066406, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6909464001655579, + "num_tokens": 466517105.0, + "step": 18030 + }, + { + "epoch": 1.9801229958269273, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.414095401763916, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.722939670085907, + "num_tokens": 466542822.0, + "step": 18031 + }, + { + "epoch": 1.980232813529541, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.580118417739868, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7156950831413269, + "num_tokens": 466566406.0, + "step": 18032 + }, + { + "epoch": 1.9803426312321546, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.342574119567871, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 
0.7086241245269775, + "num_tokens": 466594203.0, + "step": 18033 + }, + { + "epoch": 1.9804524489347681, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2098255157470703, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7053776979446411, + "num_tokens": 466624685.0, + "step": 18034 + }, + { + "epoch": 1.9805622666373819, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2404136657714844, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.71152663230896, + "num_tokens": 466652801.0, + "step": 18035 + }, + { + "epoch": 1.9806720843399956, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.388901710510254, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7096576690673828, + "num_tokens": 466678910.0, + "step": 18036 + }, + { + "epoch": 1.9807819020426094, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3764686584472656, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7153586745262146, + "num_tokens": 466707742.0, + "step": 18037 + }, + { + "epoch": 1.980891719745223, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.471055030822754, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7002295255661011, + "num_tokens": 466730741.0, + "step": 18038 + }, + { + "epoch": 1.9810015374478365, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.308523416519165, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.70212322473526, + "num_tokens": 466757928.0, + "step": 18039 + }, + { + "epoch": 1.9811113551504502, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.481372833251953, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7052806615829468, + "num_tokens": 466781909.0, + "step": 18040 + }, + { + "epoch": 1.981221172853064, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.481095314025879, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.700177013874054, + "num_tokens": 
466807173.0, + "step": 18041 + }, + { + "epoch": 1.9813309905556777, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4860105514526367, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7212173938751221, + "num_tokens": 466829869.0, + "step": 18042 + }, + { + "epoch": 1.9814408082582913, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.20831298828125, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7168521881103516, + "num_tokens": 466857696.0, + "step": 18043 + }, + { + "epoch": 1.9815506259609048, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4937775135040283, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7250016331672668, + "num_tokens": 466879323.0, + "step": 18044 + }, + { + "epoch": 1.9816604436635186, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.382150411605835, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.708301305770874, + "num_tokens": 466904558.0, + "step": 18045 + }, + { + "epoch": 1.9817702613661323, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.750359058380127, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7119805812835693, + "num_tokens": 466924408.0, + "step": 18046 + }, + { + "epoch": 1.9818800790687459, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.500305652618408, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.737099289894104, + "num_tokens": 466945415.0, + "step": 18047 + }, + { + "epoch": 1.9819898967713594, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3829410076141357, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7132216691970825, + "num_tokens": 466971936.0, + "step": 18048 + }, + { + "epoch": 1.9820997144739732, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3865175247192383, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.7035461664199829, + "num_tokens": 466999690.0, + "step": 18049 + 
}, + { + "epoch": 1.982209532176587, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.801100015640259, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7350172400474548, + "num_tokens": 467019527.0, + "step": 18050 + }, + { + "epoch": 1.9823193498792007, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.208282709121704, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7193419933319092, + "num_tokens": 467047899.0, + "step": 18051 + }, + { + "epoch": 1.9824291675818142, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5718865394592285, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7217763662338257, + "num_tokens": 467069050.0, + "step": 18052 + }, + { + "epoch": 1.9825389852844277, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.446136951446533, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7323753833770752, + "num_tokens": 467091624.0, + "step": 18053 + }, + { + "epoch": 1.9826488029870415, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3293371200561523, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6879327893257141, + "num_tokens": 467119004.0, + "step": 18054 + }, + { + "epoch": 1.9827586206896552, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2200779914855957, + "learning_rate": 1e-06, + "loss": 1.1032, + "mean_token_accuracy": 0.6774351596832275, + "num_tokens": 467150163.0, + "step": 18055 + }, + { + "epoch": 1.9828684383922688, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1280086040496826, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.733335554599762, + "num_tokens": 467180353.0, + "step": 18056 + }, + { + "epoch": 1.9829782560948825, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.472782611846924, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7449889183044434, + "num_tokens": 467200651.0, + "step": 18057 + }, + { + "epoch": 
1.983088073797496, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3618130683898926, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7287307977676392, + "num_tokens": 467225799.0, + "step": 18058 + }, + { + "epoch": 1.9831978915001098, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.94006609916687, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.718483567237854, + "num_tokens": 467243075.0, + "step": 18059 + }, + { + "epoch": 1.9833077092027236, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.270787477493286, + "learning_rate": 1e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.688006579875946, + "num_tokens": 467272737.0, + "step": 18060 + }, + { + "epoch": 1.9834175269053371, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.0633835792541504, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7254180908203125, + "num_tokens": 467301245.0, + "step": 18061 + }, + { + "epoch": 1.9835273446079507, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.154947519302368, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6966598033905029, + "num_tokens": 467331176.0, + "step": 18062 + }, + { + "epoch": 1.9836371623105644, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.663553237915039, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7362070679664612, + "num_tokens": 467350422.0, + "step": 18063 + }, + { + "epoch": 1.9837469800131782, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.256758213043213, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7225593328475952, + "num_tokens": 467377058.0, + "step": 18064 + }, + { + "epoch": 1.983856797715792, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2937848567962646, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6917547583580017, + "num_tokens": 467405378.0, + "step": 18065 + }, + { + "epoch": 1.9839666154184055, + "ewc_loss": 
1.9431114196777344e-05, + "grad_norm": 2.295193910598755, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7078248262405396, + "num_tokens": 467433938.0, + "step": 18066 + }, + { + "epoch": 1.984076433121019, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5388576984405518, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6899802088737488, + "num_tokens": 467456400.0, + "step": 18067 + }, + { + "epoch": 1.9841862508236328, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.246586322784424, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7022774815559387, + "num_tokens": 467485415.0, + "step": 18068 + }, + { + "epoch": 1.9842960685262465, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1624879837036133, + "learning_rate": 1e-06, + "loss": 1.1046, + "mean_token_accuracy": 0.6805396676063538, + "num_tokens": 467516613.0, + "step": 18069 + }, + { + "epoch": 1.98440588622886, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.34433650970459, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7139922380447388, + "num_tokens": 467541554.0, + "step": 18070 + }, + { + "epoch": 1.9845157039314738, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.612967014312744, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7267120480537415, + "num_tokens": 467562551.0, + "step": 18071 + }, + { + "epoch": 1.9846255216340873, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.498918056488037, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7053912878036499, + "num_tokens": 467587875.0, + "step": 18072 + }, + { + "epoch": 1.984735339336701, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.522407293319702, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7270500659942627, + "num_tokens": 467609473.0, + "step": 18073 + }, + { + "epoch": 1.9848451570393149, + "ewc_loss": 1.9550323486328125e-05, + 
"grad_norm": 2.1695680618286133, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.72286057472229, + "num_tokens": 467638318.0, + "step": 18074 + }, + { + "epoch": 1.9849549747419284, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.198915719985962, + "learning_rate": 1e-06, + "loss": 1.0814, + "mean_token_accuracy": 0.6851418018341064, + "num_tokens": 467669113.0, + "step": 18075 + }, + { + "epoch": 1.985064792444542, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.50831937789917, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6909666061401367, + "num_tokens": 467695282.0, + "step": 18076 + }, + { + "epoch": 1.9851746101471557, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5210015773773193, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7138403654098511, + "num_tokens": 467718225.0, + "step": 18077 + }, + { + "epoch": 1.9852844278497694, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.636697769165039, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6905112862586975, + "num_tokens": 467741915.0, + "step": 18078 + }, + { + "epoch": 1.9853942455523832, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.574867010116577, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7197749018669128, + "num_tokens": 467762792.0, + "step": 18079 + }, + { + "epoch": 1.9855040632549967, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6422855854034424, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7096726894378662, + "num_tokens": 467784467.0, + "step": 18080 + }, + { + "epoch": 1.9856138809576103, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.438260078430176, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.721799373626709, + "num_tokens": 467809037.0, + "step": 18081 + }, + { + "epoch": 1.985723698660224, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4947421550750732, + 
"learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7237525582313538, + "num_tokens": 467833099.0, + "step": 18082 + }, + { + "epoch": 1.9858335163628378, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.325448513031006, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.7045819163322449, + "num_tokens": 467861147.0, + "step": 18083 + }, + { + "epoch": 1.9859433340654513, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.520540714263916, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.701400876045227, + "num_tokens": 467886383.0, + "step": 18084 + }, + { + "epoch": 1.9860531517680649, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.783970355987549, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7352156043052673, + "num_tokens": 467904883.0, + "step": 18085 + }, + { + "epoch": 1.9861629694706786, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6337006092071533, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.7109920978546143, + "num_tokens": 467927722.0, + "step": 18086 + }, + { + "epoch": 1.9862727871732924, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4382715225219727, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7312444448471069, + "num_tokens": 467954533.0, + "step": 18087 + }, + { + "epoch": 1.9863826048759061, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2532665729522705, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7391097545623779, + "num_tokens": 467980580.0, + "step": 18088 + }, + { + "epoch": 1.9864924225785197, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6690165996551514, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7306994199752808, + "num_tokens": 468001046.0, + "step": 18089 + }, + { + "epoch": 1.9866022402811332, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.599217414855957, + "learning_rate": 1e-06, + 
"loss": 0.9342, + "mean_token_accuracy": 0.7269701957702637, + "num_tokens": 468022546.0, + "step": 18090 + }, + { + "epoch": 1.986712057983747, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5207183361053467, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7093017101287842, + "num_tokens": 468045996.0, + "step": 18091 + }, + { + "epoch": 1.9868218756863607, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.322591781616211, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7288234233856201, + "num_tokens": 468073492.0, + "step": 18092 + }, + { + "epoch": 1.9869316933889745, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.561882495880127, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6980053782463074, + "num_tokens": 468097229.0, + "step": 18093 + }, + { + "epoch": 1.987041511091588, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2684171199798584, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6921413540840149, + "num_tokens": 468124449.0, + "step": 18094 + }, + { + "epoch": 1.9871513287942015, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3048980236053467, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6925824880599976, + "num_tokens": 468153002.0, + "step": 18095 + }, + { + "epoch": 1.9872611464968153, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.587775707244873, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7175443172454834, + "num_tokens": 468175728.0, + "step": 18096 + }, + { + "epoch": 1.987370964199429, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1481704711914062, + "learning_rate": 1e-06, + "loss": 1.0867, + "mean_token_accuracy": 0.681641161441803, + "num_tokens": 468208067.0, + "step": 18097 + }, + { + "epoch": 1.9874807819020426, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.593575954437256, + "learning_rate": 1e-06, + "loss": 0.9237, + 
"mean_token_accuracy": 0.7261072397232056, + "num_tokens": 468229495.0, + "step": 18098 + }, + { + "epoch": 1.9875905996046561, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 3.0323634147644043, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7211692333221436, + "num_tokens": 468256501.0, + "step": 18099 + }, + { + "epoch": 1.9877004173072699, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.491032123565674, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7294883728027344, + "num_tokens": 468280381.0, + "step": 18100 + }, + { + "epoch": 1.9878102350098836, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.09128475189209, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.7026216387748718, + "num_tokens": 468311940.0, + "step": 18101 + }, + { + "epoch": 1.9879200527124974, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.552009344100952, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7113094329833984, + "num_tokens": 468335383.0, + "step": 18102 + }, + { + "epoch": 1.988029870415111, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6094536781311035, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6968456506729126, + "num_tokens": 468358251.0, + "step": 18103 + }, + { + "epoch": 1.9881396881177245, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4752707481384277, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7385474443435669, + "num_tokens": 468381560.0, + "step": 18104 + }, + { + "epoch": 1.9882495058203382, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.606694221496582, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.696683406829834, + "num_tokens": 468405235.0, + "step": 18105 + }, + { + "epoch": 1.988359323522952, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2899906635284424, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 
0.7177928686141968, + "num_tokens": 468433779.0, + "step": 18106 + }, + { + "epoch": 1.9884691412255655, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.218344211578369, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7236417531967163, + "num_tokens": 468465278.0, + "step": 18107 + }, + { + "epoch": 1.9885789589281793, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5879440307617188, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7264648675918579, + "num_tokens": 468489088.0, + "step": 18108 + }, + { + "epoch": 1.9886887766307928, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1889302730560303, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7004973888397217, + "num_tokens": 468521256.0, + "step": 18109 + }, + { + "epoch": 1.9887985943334066, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.621628999710083, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7265756130218506, + "num_tokens": 468541124.0, + "step": 18110 + }, + { + "epoch": 1.9889084120360203, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4844107627868652, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7135772705078125, + "num_tokens": 468565961.0, + "step": 18111 + }, + { + "epoch": 1.9890182297386338, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1598448753356934, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7012628316879272, + "num_tokens": 468595579.0, + "step": 18112 + }, + { + "epoch": 1.9891280474412474, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3668408393859863, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7157909870147705, + "num_tokens": 468621118.0, + "step": 18113 + }, + { + "epoch": 1.9892378651438611, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6496310234069824, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7209769487380981, + 
"num_tokens": 468641545.0, + "step": 18114 + }, + { + "epoch": 1.989347682846475, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2821943759918213, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7087469100952148, + "num_tokens": 468668558.0, + "step": 18115 + }, + { + "epoch": 1.9894575005490887, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2650887966156006, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6895014047622681, + "num_tokens": 468697598.0, + "step": 18116 + }, + { + "epoch": 1.9895673182517022, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2051024436950684, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6985626816749573, + "num_tokens": 468726401.0, + "step": 18117 + }, + { + "epoch": 1.9896771359543157, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.650184154510498, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6976354122161865, + "num_tokens": 468748943.0, + "step": 18118 + }, + { + "epoch": 1.9897869536569295, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1582179069519043, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.707918107509613, + "num_tokens": 468777685.0, + "step": 18119 + }, + { + "epoch": 1.9898967713595432, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.556452989578247, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7160091400146484, + "num_tokens": 468800301.0, + "step": 18120 + }, + { + "epoch": 1.9900065890621568, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.299612283706665, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7152889966964722, + "num_tokens": 468826461.0, + "step": 18121 + }, + { + "epoch": 1.9901164067647705, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.118549346923828, + "learning_rate": 1e-06, + "loss": 1.1285, + "mean_token_accuracy": 0.6864776611328125, + "num_tokens": 468858553.0, + 
"step": 18122 + }, + { + "epoch": 1.990226224467384, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.205782890319824, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.6892744898796082, + "num_tokens": 468888908.0, + "step": 18123 + }, + { + "epoch": 1.9903360421699978, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.050084114074707, + "learning_rate": 1e-06, + "loss": 1.0801, + "mean_token_accuracy": 0.6810991764068604, + "num_tokens": 468922491.0, + "step": 18124 + }, + { + "epoch": 1.9904458598726116, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.086442232131958, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7012462019920349, + "num_tokens": 468954283.0, + "step": 18125 + }, + { + "epoch": 1.9905556775752251, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5111749172210693, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7026041746139526, + "num_tokens": 468977253.0, + "step": 18126 + }, + { + "epoch": 1.9906654952778386, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.589195966720581, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7203561663627625, + "num_tokens": 468997728.0, + "step": 18127 + }, + { + "epoch": 1.9907753129804524, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.370758295059204, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7146592140197754, + "num_tokens": 469022017.0, + "step": 18128 + }, + { + "epoch": 1.9908851306830662, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3735249042510986, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7139180898666382, + "num_tokens": 469045797.0, + "step": 18129 + }, + { + "epoch": 1.99099494838568, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.273061990737915, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6872392892837524, + "num_tokens": 469077174.0, + "step": 18130 + }, + { + "epoch": 
1.9911047660882935, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.288102626800537, + "learning_rate": 1e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.6833642721176147, + "num_tokens": 469104610.0, + "step": 18131 + }, + { + "epoch": 1.991214583790907, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1018311977386475, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7032973170280457, + "num_tokens": 469136746.0, + "step": 18132 + }, + { + "epoch": 1.9913244014935207, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.136409282684326, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6997485756874084, + "num_tokens": 469167149.0, + "step": 18133 + }, + { + "epoch": 1.9914342191961345, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1627230644226074, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7032530307769775, + "num_tokens": 469194948.0, + "step": 18134 + }, + { + "epoch": 1.991544036898748, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.279785633087158, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7064309120178223, + "num_tokens": 469220938.0, + "step": 18135 + }, + { + "epoch": 1.9916538546013618, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0972766876220703, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7151932716369629, + "num_tokens": 469257117.0, + "step": 18136 + }, + { + "epoch": 1.9917636723039753, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2881152629852295, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7103487849235535, + "num_tokens": 469284383.0, + "step": 18137 + }, + { + "epoch": 1.991873490006589, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.437861204147339, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7158150672912598, + "num_tokens": 469307692.0, + "step": 18138 + }, + { + "epoch": 1.9919833077092028, + 
"ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.5197198390960693, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7064371705055237, + "num_tokens": 469330147.0, + "step": 18139 + }, + { + "epoch": 1.9920931254118164, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3357772827148438, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7146052122116089, + "num_tokens": 469355986.0, + "step": 18140 + }, + { + "epoch": 1.99220294311443, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.313732624053955, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7136046886444092, + "num_tokens": 469384324.0, + "step": 18141 + }, + { + "epoch": 1.9923127608170437, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1080031394958496, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7165591716766357, + "num_tokens": 469414795.0, + "step": 18142 + }, + { + "epoch": 1.9924225785196574, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.512819290161133, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6987898945808411, + "num_tokens": 469437569.0, + "step": 18143 + }, + { + "epoch": 1.9925323962222712, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4384829998016357, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7319019436836243, + "num_tokens": 469461788.0, + "step": 18144 + }, + { + "epoch": 1.9926422139248847, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.308107614517212, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7013611197471619, + "num_tokens": 469489058.0, + "step": 18145 + }, + { + "epoch": 1.9927520316274983, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2063963413238525, + "learning_rate": 1e-06, + "loss": 1.091, + "mean_token_accuracy": 0.6836487650871277, + "num_tokens": 469518606.0, + "step": 18146 + }, + { + "epoch": 1.992861849330112, + "ewc_loss": 
1.9431114196777344e-05, + "grad_norm": 2.446342945098877, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7134008407592773, + "num_tokens": 469542224.0, + "step": 18147 + }, + { + "epoch": 1.9929716670327258, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.191287040710449, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.706188440322876, + "num_tokens": 469571020.0, + "step": 18148 + }, + { + "epoch": 1.9930814847353393, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.313805341720581, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7170670032501221, + "num_tokens": 469598688.0, + "step": 18149 + }, + { + "epoch": 1.9931913024379528, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.139911651611328, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7169039845466614, + "num_tokens": 469628287.0, + "step": 18150 + }, + { + "epoch": 1.9933011201405666, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.112745761871338, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6986855268478394, + "num_tokens": 469661848.0, + "step": 18151 + }, + { + "epoch": 1.9934109378431804, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.14148211479187, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7187081575393677, + "num_tokens": 469691358.0, + "step": 18152 + }, + { + "epoch": 1.993520755545794, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.305001974105835, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7131732702255249, + "num_tokens": 469719859.0, + "step": 18153 + }, + { + "epoch": 1.9936305732484076, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4097537994384766, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6968568563461304, + "num_tokens": 469744208.0, + "step": 18154 + }, + { + "epoch": 1.9937403909510212, + "ewc_loss": 1.9431114196777344e-05, + 
"grad_norm": 2.3663086891174316, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6950594186782837, + "num_tokens": 469775148.0, + "step": 18155 + }, + { + "epoch": 1.993850208653635, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.733753204345703, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7434267401695251, + "num_tokens": 469796709.0, + "step": 18156 + }, + { + "epoch": 1.9939600263562487, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.651384115219116, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7075141072273254, + "num_tokens": 469819795.0, + "step": 18157 + }, + { + "epoch": 1.9940698440588625, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.413081169128418, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7330770492553711, + "num_tokens": 469844011.0, + "step": 18158 + }, + { + "epoch": 1.994179661761476, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.455498456954956, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7080484628677368, + "num_tokens": 469867553.0, + "step": 18159 + }, + { + "epoch": 1.9942894794640895, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3738327026367188, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7169506549835205, + "num_tokens": 469896905.0, + "step": 18160 + }, + { + "epoch": 1.9943992971667033, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.654449462890625, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6938951015472412, + "num_tokens": 469920200.0, + "step": 18161 + }, + { + "epoch": 1.994509114869317, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.542335271835327, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7101559638977051, + "num_tokens": 469944573.0, + "step": 18162 + }, + { + "epoch": 1.9946189325719306, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4076437950134277, 
+ "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7102606296539307, + "num_tokens": 469967220.0, + "step": 18163 + }, + { + "epoch": 1.994728750274544, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6393308639526367, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7198764085769653, + "num_tokens": 469987942.0, + "step": 18164 + }, + { + "epoch": 1.9948385679771579, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2699501514434814, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7107154726982117, + "num_tokens": 470017012.0, + "step": 18165 + }, + { + "epoch": 1.9949483856797716, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3204703330993652, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7159008383750916, + "num_tokens": 470042843.0, + "step": 18166 + }, + { + "epoch": 1.9950582033823854, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.344931125640869, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7348830103874207, + "num_tokens": 470069002.0, + "step": 18167 + }, + { + "epoch": 1.995168021084999, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.572955846786499, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.723529577255249, + "num_tokens": 470089815.0, + "step": 18168 + }, + { + "epoch": 1.9952778387876124, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.148813009262085, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.7002487778663635, + "num_tokens": 470121345.0, + "step": 18169 + }, + { + "epoch": 1.9953876564902262, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2845816612243652, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7232726812362671, + "num_tokens": 470148531.0, + "step": 18170 + }, + { + "epoch": 1.99549747419284, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3988871574401855, + "learning_rate": 1e-06, + 
"loss": 1.0806, + "mean_token_accuracy": 0.6897779703140259, + "num_tokens": 470174290.0, + "step": 18171 + }, + { + "epoch": 1.9956072918954535, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2578651905059814, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.7055572867393494, + "num_tokens": 470201166.0, + "step": 18172 + }, + { + "epoch": 1.9957171095980673, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.622312068939209, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7111519575119019, + "num_tokens": 470222186.0, + "step": 18173 + }, + { + "epoch": 1.9958269273006808, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1520867347717285, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.725028395652771, + "num_tokens": 470253168.0, + "step": 18174 + }, + { + "epoch": 1.9959367450032945, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.3429086208343506, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.708467960357666, + "num_tokens": 470281135.0, + "step": 18175 + }, + { + "epoch": 1.9960465627059083, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.362562656402588, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.7020198106765747, + "num_tokens": 470307947.0, + "step": 18176 + }, + { + "epoch": 1.9961563804085218, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.163041353225708, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7431744337081909, + "num_tokens": 470335331.0, + "step": 18177 + }, + { + "epoch": 1.9962661981111354, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1661176681518555, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7012332677841187, + "num_tokens": 470365413.0, + "step": 18178 + }, + { + "epoch": 1.9963760158137491, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.1884703636169434, + "learning_rate": 1e-06, + "loss": 1.0448, + 
"mean_token_accuracy": 0.694684624671936, + "num_tokens": 470396737.0, + "step": 18179 + }, + { + "epoch": 1.9964858335163629, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.422804355621338, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7025547027587891, + "num_tokens": 470422654.0, + "step": 18180 + }, + { + "epoch": 1.9965956512189766, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.34415340423584, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7164683938026428, + "num_tokens": 470449058.0, + "step": 18181 + }, + { + "epoch": 1.9967054689215902, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.477725028991699, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7312148213386536, + "num_tokens": 470471733.0, + "step": 18182 + }, + { + "epoch": 1.9968152866242037, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.6054563522338867, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7071665525436401, + "num_tokens": 470494067.0, + "step": 18183 + }, + { + "epoch": 1.9969251043268175, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.336557388305664, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7065726518630981, + "num_tokens": 470521583.0, + "step": 18184 + }, + { + "epoch": 1.9970349220294312, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4314239025115967, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.707754909992218, + "num_tokens": 470545232.0, + "step": 18185 + }, + { + "epoch": 1.9971447397320448, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.2655251026153564, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7144449949264526, + "num_tokens": 470573581.0, + "step": 18186 + }, + { + "epoch": 1.9972545574346585, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.627490758895874, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 
0.716361403465271, + "num_tokens": 470594915.0, + "step": 18187 + }, + { + "epoch": 1.997364375137272, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.4769139289855957, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7160940170288086, + "num_tokens": 470619696.0, + "step": 18188 + }, + { + "epoch": 1.9974741928398858, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2875254154205322, + "learning_rate": 1e-06, + "loss": 1.069, + "mean_token_accuracy": 0.6873738169670105, + "num_tokens": 470648134.0, + "step": 18189 + }, + { + "epoch": 1.9975840105424996, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4187369346618652, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7239154577255249, + "num_tokens": 470672694.0, + "step": 18190 + }, + { + "epoch": 1.997693828245113, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.365579128265381, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7467844486236572, + "num_tokens": 470698698.0, + "step": 18191 + }, + { + "epoch": 1.9978036459477266, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.342616319656372, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7201269268989563, + "num_tokens": 470725993.0, + "step": 18192 + }, + { + "epoch": 1.9979134636503404, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3310964107513428, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.7069804668426514, + "num_tokens": 470751735.0, + "step": 18193 + }, + { + "epoch": 1.9980232813529542, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 3.26065993309021, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7266972064971924, + "num_tokens": 470768977.0, + "step": 18194 + }, + { + "epoch": 1.998133099055568, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2152457237243652, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7065867185592651, + "num_tokens": 
470797383.0, + "step": 18195 + }, + { + "epoch": 1.9982429167581814, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.8531689643859863, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7349328994750977, + "num_tokens": 470818262.0, + "step": 18196 + }, + { + "epoch": 1.998352734460795, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.300121784210205, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6977427005767822, + "num_tokens": 470845484.0, + "step": 18197 + }, + { + "epoch": 1.9984625521634087, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4530601501464844, + "learning_rate": 1e-06, + "loss": 1.0825, + "mean_token_accuracy": 0.6894627809524536, + "num_tokens": 470870283.0, + "step": 18198 + }, + { + "epoch": 1.9985723698660225, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5676426887512207, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7194400429725647, + "num_tokens": 470891092.0, + "step": 18199 + }, + { + "epoch": 1.998682187568636, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.340319871902466, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7167441248893738, + "num_tokens": 470916592.0, + "step": 18200 + }, + { + "epoch": 1.9987920052712496, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2855823040008545, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7283074855804443, + "num_tokens": 470943251.0, + "step": 18201 + }, + { + "epoch": 1.9989018229738633, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.317875862121582, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7269929051399231, + "num_tokens": 470970150.0, + "step": 18202 + }, + { + "epoch": 1.999011640676477, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.138092041015625, + "learning_rate": 1e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.6825812458992004, + "num_tokens": 471001184.0, + "step": 18203 + 
}, + { + "epoch": 1.9991214583790908, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4466042518615723, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7154254913330078, + "num_tokens": 471025144.0, + "step": 18204 + }, + { + "epoch": 1.9992312760817044, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0711920261383057, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6968487501144409, + "num_tokens": 471059042.0, + "step": 18205 + }, + { + "epoch": 1.999341093784318, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1423614025115967, + "learning_rate": 1e-06, + "loss": 1.0971, + "mean_token_accuracy": 0.6816302537918091, + "num_tokens": 471090984.0, + "step": 18206 + }, + { + "epoch": 1.9994509114869317, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 2.58073091506958, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7239102721214294, + "num_tokens": 471111508.0, + "step": 18207 + }, + { + "epoch": 1.9995607291895454, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.37937068939209, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7006237506866455, + "num_tokens": 471136394.0, + "step": 18208 + }, + { + "epoch": 1.9996705468921592, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3457934856414795, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7055294513702393, + "num_tokens": 471164069.0, + "step": 18209 + }, + { + "epoch": 1.9997803645947727, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.461552143096924, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7105698585510254, + "num_tokens": 471189023.0, + "step": 18210 + }, + { + "epoch": 1.9998901822973862, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6018056869506836, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7239621877670288, + "num_tokens": 471209443.0, + "step": 18211 + }, + { + "epoch": 2.0, + 
"ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4548585414886475, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6955435872077942, + "num_tokens": 471235014.0, + "step": 18212 + }, + { + "epoch": 2.0001098177026138, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3554654121398926, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.712601900100708, + "num_tokens": 471260654.0, + "step": 18213 + }, + { + "epoch": 2.0002196354052275, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.1955411434173584, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7142942547798157, + "num_tokens": 471286421.0, + "step": 18214 + }, + { + "epoch": 2.000329453107841, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.414562463760376, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7157471179962158, + "num_tokens": 471310258.0, + "step": 18215 + }, + { + "epoch": 2.0004392708104546, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2006425857543945, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7101501226425171, + "num_tokens": 471338723.0, + "step": 18216 + }, + { + "epoch": 2.0005490885130683, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.141113758087158, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7159092426300049, + "num_tokens": 471369007.0, + "step": 18217 + }, + { + "epoch": 2.000658906215682, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6328036785125732, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7291908860206604, + "num_tokens": 471390448.0, + "step": 18218 + }, + { + "epoch": 2.0007687239182954, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2924234867095947, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7065820097923279, + "num_tokens": 471420002.0, + "step": 18219 + }, + { + "epoch": 2.000878541620909, + "ewc_loss": 
1.9550323486328125e-05, + "grad_norm": 2.7038283348083496, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.711726188659668, + "num_tokens": 471442498.0, + "step": 18220 + }, + { + "epoch": 2.000988359323523, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.682961940765381, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7208085060119629, + "num_tokens": 471463307.0, + "step": 18221 + }, + { + "epoch": 2.0010981770261367, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4888973236083984, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7391051054000854, + "num_tokens": 471487919.0, + "step": 18222 + }, + { + "epoch": 2.0012079947287504, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3948142528533936, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7449020147323608, + "num_tokens": 471514828.0, + "step": 18223 + }, + { + "epoch": 2.0013178124313638, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6044023036956787, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7435388565063477, + "num_tokens": 471537025.0, + "step": 18224 + }, + { + "epoch": 2.0014276301339775, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 7.152444839477539, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7093387842178345, + "num_tokens": 471557930.0, + "step": 18225 + }, + { + "epoch": 2.0015374478365913, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3654019832611084, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7066066861152649, + "num_tokens": 471582641.0, + "step": 18226 + }, + { + "epoch": 2.001647265539205, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5610239505767822, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7414369583129883, + "num_tokens": 471604738.0, + "step": 18227 + }, + { + "epoch": 2.001757083241819, + "ewc_loss": 1.9550323486328125e-05, + 
"grad_norm": 2.292228937149048, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6889119148254395, + "num_tokens": 471634425.0, + "step": 18228 + }, + { + "epoch": 2.001866900944432, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3578526973724365, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7133994698524475, + "num_tokens": 471660871.0, + "step": 18229 + }, + { + "epoch": 2.001976718647046, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2907214164733887, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7151093482971191, + "num_tokens": 471688335.0, + "step": 18230 + }, + { + "epoch": 2.0020865363496596, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2479145526885986, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6953336000442505, + "num_tokens": 471718337.0, + "step": 18231 + }, + { + "epoch": 2.0021963540522734, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.113319158554077, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7023544311523438, + "num_tokens": 471751966.0, + "step": 18232 + }, + { + "epoch": 2.0023061717548867, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6592609882354736, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7340213656425476, + "num_tokens": 471773386.0, + "step": 18233 + }, + { + "epoch": 2.0024159894575004, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.593109130859375, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7201147675514221, + "num_tokens": 471795477.0, + "step": 18234 + }, + { + "epoch": 2.002525807160114, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.315722703933716, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7092180252075195, + "num_tokens": 471825957.0, + "step": 18235 + }, + { + "epoch": 2.002635624862728, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 
2.2527618408203125, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.708170473575592, + "num_tokens": 471853767.0, + "step": 18236 + }, + { + "epoch": 2.0027454425653417, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3687002658843994, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.7044433951377869, + "num_tokens": 471883082.0, + "step": 18237 + }, + { + "epoch": 2.002855260267955, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.327437162399292, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7245997190475464, + "num_tokens": 471910218.0, + "step": 18238 + }, + { + "epoch": 2.0029650779705688, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.359884738922119, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7114023566246033, + "num_tokens": 471936993.0, + "step": 18239 + }, + { + "epoch": 2.0030748956731825, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.403132200241089, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6899939775466919, + "num_tokens": 471964226.0, + "step": 18240 + }, + { + "epoch": 2.0031847133757963, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5318078994750977, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7180333137512207, + "num_tokens": 471986803.0, + "step": 18241 + }, + { + "epoch": 2.00329453107841, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5925228595733643, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7140834331512451, + "num_tokens": 472010234.0, + "step": 18242 + }, + { + "epoch": 2.0034043487810234, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.649559497833252, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7207516431808472, + "num_tokens": 472032126.0, + "step": 18243 + }, + { + "epoch": 2.003514166483637, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 4.510548114776611, + 
"learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7095868587493896, + "num_tokens": 472057020.0, + "step": 18244 + }, + { + "epoch": 2.003623984186251, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6144118309020996, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.7023453712463379, + "num_tokens": 472079754.0, + "step": 18245 + }, + { + "epoch": 2.0037338018888646, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2378618717193604, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7357468008995056, + "num_tokens": 472107357.0, + "step": 18246 + }, + { + "epoch": 2.003843619591478, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.478861093521118, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7236919403076172, + "num_tokens": 472132314.0, + "step": 18247 + }, + { + "epoch": 2.0039534372940917, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5498251914978027, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7067244052886963, + "num_tokens": 472159056.0, + "step": 18248 + }, + { + "epoch": 2.0040632549967055, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.279085159301758, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7342955470085144, + "num_tokens": 472187610.0, + "step": 18249 + }, + { + "epoch": 2.004173072699319, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.543466806411743, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.717266857624054, + "num_tokens": 472211027.0, + "step": 18250 + }, + { + "epoch": 2.004282890401933, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.239710807800293, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7140969634056091, + "num_tokens": 472240380.0, + "step": 18251 + }, + { + "epoch": 2.0043927081045463, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.143293857574463, + "learning_rate": 1e-06, + "loss": 
0.8905, + "mean_token_accuracy": 0.7345755100250244, + "num_tokens": 472270893.0, + "step": 18252 + }, + { + "epoch": 2.00450252580716, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3899154663085938, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7355897426605225, + "num_tokens": 472296851.0, + "step": 18253 + }, + { + "epoch": 2.004612343509774, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3913257122039795, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7148076295852661, + "num_tokens": 472324270.0, + "step": 18254 + }, + { + "epoch": 2.0047221612123876, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.38144850730896, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.725983202457428, + "num_tokens": 472350052.0, + "step": 18255 + }, + { + "epoch": 2.0048319789150013, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3709514141082764, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.718013346195221, + "num_tokens": 472377760.0, + "step": 18256 + }, + { + "epoch": 2.0049417966176146, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.081265687942505, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7069063186645508, + "num_tokens": 472413478.0, + "step": 18257 + }, + { + "epoch": 2.0050516143202284, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.355642080307007, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6890798807144165, + "num_tokens": 472441679.0, + "step": 18258 + }, + { + "epoch": 2.005161432022842, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.436816453933716, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7309146523475647, + "num_tokens": 472467174.0, + "step": 18259 + }, + { + "epoch": 2.005271249725456, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4003067016601562, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 
0.7274060249328613, + "num_tokens": 472491450.0, + "step": 18260 + }, + { + "epoch": 2.005381067428069, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.707206964492798, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7334062457084656, + "num_tokens": 472513867.0, + "step": 18261 + }, + { + "epoch": 2.005490885130683, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4322757720947266, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.723290741443634, + "num_tokens": 472538909.0, + "step": 18262 + }, + { + "epoch": 2.0056007028332967, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.0789661407470703, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7082951664924622, + "num_tokens": 472573516.0, + "step": 18263 + }, + { + "epoch": 2.0057105205359105, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.210881233215332, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7031815052032471, + "num_tokens": 472604040.0, + "step": 18264 + }, + { + "epoch": 2.0058203382385242, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.525858163833618, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7224973440170288, + "num_tokens": 472628842.0, + "step": 18265 + }, + { + "epoch": 2.0059301559411375, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6176490783691406, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6917619705200195, + "num_tokens": 472653793.0, + "step": 18266 + }, + { + "epoch": 2.0060399736437513, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3416659832000732, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6888000965118408, + "num_tokens": 472682750.0, + "step": 18267 + }, + { + "epoch": 2.006149791346365, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3987550735473633, + "learning_rate": 1e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.6912219524383545, + 
"num_tokens": 472708280.0, + "step": 18268 + }, + { + "epoch": 2.006259609048979, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.483046531677246, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7061728239059448, + "num_tokens": 472732535.0, + "step": 18269 + }, + { + "epoch": 2.0063694267515926, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.285195827484131, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6957059502601624, + "num_tokens": 472761119.0, + "step": 18270 + }, + { + "epoch": 2.006479244454206, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.758615016937256, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7294608354568481, + "num_tokens": 472782492.0, + "step": 18271 + }, + { + "epoch": 2.0065890621568196, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.3942711353302, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7075478434562683, + "num_tokens": 472809592.0, + "step": 18272 + }, + { + "epoch": 2.0066988798594334, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2362890243530273, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7104300856590271, + "num_tokens": 472838346.0, + "step": 18273 + }, + { + "epoch": 2.006808697562047, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2804248332977295, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6971598863601685, + "num_tokens": 472868408.0, + "step": 18274 + }, + { + "epoch": 2.0069185152646605, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.6167025566101074, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7287877798080444, + "num_tokens": 472891999.0, + "step": 18275 + }, + { + "epoch": 2.0070283329672742, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.7462661266326904, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7314132452011108, + "num_tokens": 472912478.0, + 
"step": 18276 + }, + { + "epoch": 2.007138150669888, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.4964053630828857, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.706814169883728, + "num_tokens": 472937871.0, + "step": 18277 + }, + { + "epoch": 2.0072479683725017, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.2308812141418457, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.7007282972335815, + "num_tokens": 472968653.0, + "step": 18278 + }, + { + "epoch": 2.0073577860751155, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 4.24062442779541, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7162811160087585, + "num_tokens": 472993966.0, + "step": 18279 + }, + { + "epoch": 2.007467603777729, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 2.5610105991363525, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7119685411453247, + "num_tokens": 473019198.0, + "step": 18280 + }, + { + "epoch": 2.0075774214803426, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3920843601226807, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7102930545806885, + "num_tokens": 473047784.0, + "step": 18281 + }, + { + "epoch": 2.0076872391829563, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.493943214416504, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7047250866889954, + "num_tokens": 473073103.0, + "step": 18282 + }, + { + "epoch": 2.00779705688557, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4177770614624023, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7286520004272461, + "num_tokens": 473098209.0, + "step": 18283 + }, + { + "epoch": 2.0079068745881834, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4185242652893066, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6846648454666138, + "num_tokens": 473123539.0, + "step": 18284 + }, + { + "epoch": 
2.008016692290797, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.1654062271118164, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6955780982971191, + "num_tokens": 473158145.0, + "step": 18285 + }, + { + "epoch": 2.008126509993411, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3538458347320557, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7055472135543823, + "num_tokens": 473185679.0, + "step": 18286 + }, + { + "epoch": 2.0082363276960247, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.737046003341675, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7242794036865234, + "num_tokens": 473205197.0, + "step": 18287 + }, + { + "epoch": 2.0083461453986384, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4737911224365234, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.72472083568573, + "num_tokens": 473230948.0, + "step": 18288 + }, + { + "epoch": 2.0084559631012517, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.7168917655944824, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7432936429977417, + "num_tokens": 473251055.0, + "step": 18289 + }, + { + "epoch": 2.0085657808038655, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.6250267028808594, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7166061401367188, + "num_tokens": 473274792.0, + "step": 18290 + }, + { + "epoch": 2.0086755985064793, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.1636533737182617, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7391379475593567, + "num_tokens": 473304591.0, + "step": 18291 + }, + { + "epoch": 2.008785416209093, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.29366397857666, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.721062421798706, + "num_tokens": 473333610.0, + "step": 18292 + }, + { + "epoch": 2.0088952339117068, + "ewc_loss": 
1.9788742065429688e-05, + "grad_norm": 2.878444194793701, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7212160229682922, + "num_tokens": 473352052.0, + "step": 18293 + }, + { + "epoch": 2.00900505161432, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.12362003326416, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6987078785896301, + "num_tokens": 473384444.0, + "step": 18294 + }, + { + "epoch": 2.009114869316934, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.1695058345794678, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7081918120384216, + "num_tokens": 473415544.0, + "step": 18295 + }, + { + "epoch": 2.0092246870195476, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.415550470352173, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7113897204399109, + "num_tokens": 473443069.0, + "step": 18296 + }, + { + "epoch": 2.0093345047221614, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.420830011367798, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7262564301490784, + "num_tokens": 473467650.0, + "step": 18297 + }, + { + "epoch": 2.0094443224247747, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.2170684337615967, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7277576923370361, + "num_tokens": 473495979.0, + "step": 18298 + }, + { + "epoch": 2.0095541401273884, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3250391483306885, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7287567853927612, + "num_tokens": 473521867.0, + "step": 18299 + }, + { + "epoch": 2.009663957830002, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.232351064682007, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7204078435897827, + "num_tokens": 473551775.0, + "step": 18300 + }, + { + "epoch": 2.009773775532616, + "ewc_loss": 1.9788742065429688e-05, + 
"grad_norm": 2.8853847980499268, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7369627356529236, + "num_tokens": 473570580.0, + "step": 18301 + }, + { + "epoch": 2.0098835932352297, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.8712992668151855, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7244638204574585, + "num_tokens": 473589849.0, + "step": 18302 + }, + { + "epoch": 2.009993410937843, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.13503360748291, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7151952981948853, + "num_tokens": 473621943.0, + "step": 18303 + }, + { + "epoch": 2.0101032286404568, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.5512993335723877, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7234689593315125, + "num_tokens": 473644412.0, + "step": 18304 + }, + { + "epoch": 2.0102130463430705, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.038681983947754, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7263203859329224, + "num_tokens": 473681233.0, + "step": 18305 + }, + { + "epoch": 2.0103228640456843, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.643249750137329, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7291662096977234, + "num_tokens": 473703760.0, + "step": 18306 + }, + { + "epoch": 2.010432681748298, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4488751888275146, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7125441431999207, + "num_tokens": 473728373.0, + "step": 18307 + }, + { + "epoch": 2.0105424994509113, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4428648948669434, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.713849663734436, + "num_tokens": 473754675.0, + "step": 18308 + }, + { + "epoch": 2.010652317153525, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 
2.5138564109802246, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7212053537368774, + "num_tokens": 473776730.0, + "step": 18309 + }, + { + "epoch": 2.010762134856139, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.2977349758148193, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7097603678703308, + "num_tokens": 473806682.0, + "step": 18310 + }, + { + "epoch": 2.0108719525587526, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.880298137664795, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6934903860092163, + "num_tokens": 473837602.0, + "step": 18311 + }, + { + "epoch": 2.010981770261366, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.486637830734253, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7239931225776672, + "num_tokens": 473861516.0, + "step": 18312 + }, + { + "epoch": 2.0110915879639797, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.350430488586426, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7330777645111084, + "num_tokens": 473887367.0, + "step": 18313 + }, + { + "epoch": 2.0112014056665934, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4309773445129395, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7095965147018433, + "num_tokens": 473914911.0, + "step": 18314 + }, + { + "epoch": 2.011311223369207, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.531463861465454, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7075109481811523, + "num_tokens": 473941329.0, + "step": 18315 + }, + { + "epoch": 2.011421041071821, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.758901357650757, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7114531397819519, + "num_tokens": 473962147.0, + "step": 18316 + }, + { + "epoch": 2.0115308587744343, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.473925828933716, + 
"learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7060308456420898, + "num_tokens": 473989895.0, + "step": 18317 + }, + { + "epoch": 2.011640676477048, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.5973868370056152, + "learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7493822574615479, + "num_tokens": 474010572.0, + "step": 18318 + }, + { + "epoch": 2.011750494179662, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 8.65211009979248, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7162374258041382, + "num_tokens": 474032697.0, + "step": 18319 + }, + { + "epoch": 2.0118603118822755, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.1762466430664062, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7386499643325806, + "num_tokens": 474063383.0, + "step": 18320 + }, + { + "epoch": 2.0119701295848893, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4521615505218506, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7167149186134338, + "num_tokens": 474091568.0, + "step": 18321 + }, + { + "epoch": 2.0120799472875026, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.569783926010132, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7420182824134827, + "num_tokens": 474113882.0, + "step": 18322 + }, + { + "epoch": 2.0121897649901164, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.2703042030334473, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7195562720298767, + "num_tokens": 474143063.0, + "step": 18323 + }, + { + "epoch": 2.01229958269273, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.2667906284332275, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7562367916107178, + "num_tokens": 474167338.0, + "step": 18324 + }, + { + "epoch": 2.012409400395344, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.537797212600708, + "learning_rate": 1e-06, + 
"loss": 0.8991, + "mean_token_accuracy": 0.7310370206832886, + "num_tokens": 474191108.0, + "step": 18325 + }, + { + "epoch": 2.012519218097957, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.427417516708374, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7146592140197754, + "num_tokens": 474216320.0, + "step": 18326 + }, + { + "epoch": 2.012629035800571, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.2154898643493652, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7144745588302612, + "num_tokens": 474248732.0, + "step": 18327 + }, + { + "epoch": 2.0127388535031847, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.5954935550689697, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7386994361877441, + "num_tokens": 474270619.0, + "step": 18328 + }, + { + "epoch": 2.0128486712057985, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.578252077102661, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7522351145744324, + "num_tokens": 474293168.0, + "step": 18329 + }, + { + "epoch": 2.0129584889084122, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.544214963912964, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7179048657417297, + "num_tokens": 474318595.0, + "step": 18330 + }, + { + "epoch": 2.0130683066110255, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3396975994110107, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7235192060470581, + "num_tokens": 474346507.0, + "step": 18331 + }, + { + "epoch": 2.0131781243136393, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3184573650360107, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7024023532867432, + "num_tokens": 474374731.0, + "step": 18332 + }, + { + "epoch": 2.013287942016253, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.6188864707946777, + "learning_rate": 1e-06, + "loss": 0.9202, + 
"mean_token_accuracy": 0.7324307560920715, + "num_tokens": 474396677.0, + "step": 18333 + }, + { + "epoch": 2.013397759718867, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3026158809661865, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7090983390808105, + "num_tokens": 474424538.0, + "step": 18334 + }, + { + "epoch": 2.01350757742148, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.7393009662628174, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7405551671981812, + "num_tokens": 474444738.0, + "step": 18335 + }, + { + "epoch": 2.013617395124094, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.723907470703125, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7430194616317749, + "num_tokens": 474464517.0, + "step": 18336 + }, + { + "epoch": 2.0137272128267076, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.278618335723877, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7121536731719971, + "num_tokens": 474495613.0, + "step": 18337 + }, + { + "epoch": 2.0138370305293214, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.5080292224884033, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6939874887466431, + "num_tokens": 474521657.0, + "step": 18338 + }, + { + "epoch": 2.013946848231935, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3783962726593018, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.713405430316925, + "num_tokens": 474549732.0, + "step": 18339 + }, + { + "epoch": 2.0140566659345485, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.246591091156006, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7152504324913025, + "num_tokens": 474578853.0, + "step": 18340 + }, + { + "epoch": 2.014166483637162, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.234600305557251, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 
0.71065753698349, + "num_tokens": 474609898.0, + "step": 18341 + }, + { + "epoch": 2.014276301339776, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.745274543762207, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7317587733268738, + "num_tokens": 474630096.0, + "step": 18342 + }, + { + "epoch": 2.0143861190423897, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.428169012069702, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7390140295028687, + "num_tokens": 474655330.0, + "step": 18343 + }, + { + "epoch": 2.0144959367450035, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.597630500793457, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7112260460853577, + "num_tokens": 474679402.0, + "step": 18344 + }, + { + "epoch": 2.014605754447617, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3823392391204834, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7260550856590271, + "num_tokens": 474709822.0, + "step": 18345 + }, + { + "epoch": 2.0147155721502306, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4677274227142334, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7437329292297363, + "num_tokens": 474734308.0, + "step": 18346 + }, + { + "epoch": 2.0148253898528443, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.611078977584839, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7367420792579651, + "num_tokens": 474757181.0, + "step": 18347 + }, + { + "epoch": 2.014935207555458, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.7182273864746094, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7107125520706177, + "num_tokens": 474779652.0, + "step": 18348 + }, + { + "epoch": 2.0150450252580714, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.6694180965423584, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7134855389595032, + 
"num_tokens": 474802679.0, + "step": 18349 + }, + { + "epoch": 2.015154842960685, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.5881435871124268, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7135607004165649, + "num_tokens": 474826802.0, + "step": 18350 + }, + { + "epoch": 2.015264660663299, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4393956661224365, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7150816917419434, + "num_tokens": 474852591.0, + "step": 18351 + }, + { + "epoch": 2.0153744783659127, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.315328598022461, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7294144630432129, + "num_tokens": 474881174.0, + "step": 18352 + }, + { + "epoch": 2.0154842960685264, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.2599120140075684, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7205415964126587, + "num_tokens": 474912081.0, + "step": 18353 + }, + { + "epoch": 2.0155941137711397, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.635115146636963, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7030482888221741, + "num_tokens": 474935198.0, + "step": 18354 + }, + { + "epoch": 2.0157039314737535, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.669201135635376, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7064970135688782, + "num_tokens": 474958871.0, + "step": 18355 + }, + { + "epoch": 2.0158137491763672, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.514927864074707, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7158221006393433, + "num_tokens": 474982605.0, + "step": 18356 + }, + { + "epoch": 2.015923566878981, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.587040901184082, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7103544473648071, + "num_tokens": 475006223.0, + 
"step": 18357 + }, + { + "epoch": 2.0160333845815948, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3609728813171387, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.713595449924469, + "num_tokens": 475034253.0, + "step": 18358 + }, + { + "epoch": 2.016143202284208, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.328434467315674, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7074259519577026, + "num_tokens": 475064327.0, + "step": 18359 + }, + { + "epoch": 2.016253019986822, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.497267961502075, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7201821804046631, + "num_tokens": 475088498.0, + "step": 18360 + }, + { + "epoch": 2.0163628376894356, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4259843826293945, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7198449373245239, + "num_tokens": 475113631.0, + "step": 18361 + }, + { + "epoch": 2.0164726553920493, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.2955362796783447, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7094725966453552, + "num_tokens": 475144330.0, + "step": 18362 + }, + { + "epoch": 2.0165824730946627, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4548165798187256, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7084096670150757, + "num_tokens": 475173602.0, + "step": 18363 + }, + { + "epoch": 2.0166922907972764, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.5389177799224854, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7227212190628052, + "num_tokens": 475197199.0, + "step": 18364 + }, + { + "epoch": 2.01680210849989, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.1682028770446777, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7400817275047302, + "num_tokens": 475228261.0, + "step": 18365 + }, + { + "epoch": 
2.016911926202504, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.452371120452881, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7059578895568848, + "num_tokens": 475255246.0, + "step": 18366 + }, + { + "epoch": 2.0170217439051177, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.589043378829956, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.739761233329773, + "num_tokens": 475278080.0, + "step": 18367 + }, + { + "epoch": 2.017131561607731, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.7224833965301514, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.734053909778595, + "num_tokens": 475297692.0, + "step": 18368 + }, + { + "epoch": 2.0172413793103448, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.440847873687744, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6920143365859985, + "num_tokens": 475323342.0, + "step": 18369 + }, + { + "epoch": 2.0173511970129585, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4799208641052246, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7209359407424927, + "num_tokens": 475346994.0, + "step": 18370 + }, + { + "epoch": 2.0174610147155723, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4125545024871826, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7168459892272949, + "num_tokens": 475372315.0, + "step": 18371 + }, + { + "epoch": 2.017570832418186, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4678425788879395, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7336332201957703, + "num_tokens": 475396438.0, + "step": 18372 + }, + { + "epoch": 2.0176806501207993, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.448096990585327, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.702961802482605, + "num_tokens": 475422077.0, + "step": 18373 + }, + { + "epoch": 2.017790467823413, + "ewc_loss": 
1.990795135498047e-05, + "grad_norm": 2.412527322769165, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7266266942024231, + "num_tokens": 475447617.0, + "step": 18374 + }, + { + "epoch": 2.017900285526027, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.580357313156128, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7257497310638428, + "num_tokens": 475471261.0, + "step": 18375 + }, + { + "epoch": 2.0180101032286406, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.684877395629883, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.722944438457489, + "num_tokens": 475492817.0, + "step": 18376 + }, + { + "epoch": 2.018119920931254, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.433006525039673, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.708085298538208, + "num_tokens": 475519515.0, + "step": 18377 + }, + { + "epoch": 2.0182297386338677, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.6104414463043213, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7170635461807251, + "num_tokens": 475546017.0, + "step": 18378 + }, + { + "epoch": 2.0183395563364814, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.413583517074585, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6962249875068665, + "num_tokens": 475576814.0, + "step": 18379 + }, + { + "epoch": 2.018449374039095, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4650869369506836, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.7023352384567261, + "num_tokens": 475604395.0, + "step": 18380 + }, + { + "epoch": 2.018559191741709, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3145527839660645, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.699516236782074, + "num_tokens": 475632406.0, + "step": 18381 + }, + { + "epoch": 2.0186690094443223, + "ewc_loss": 1.9788742065429688e-05, + 
"grad_norm": 2.1464648246765137, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.707808792591095, + "num_tokens": 475663063.0, + "step": 18382 + }, + { + "epoch": 2.018778827146936, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.7323789596557617, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7227977514266968, + "num_tokens": 475683086.0, + "step": 18383 + }, + { + "epoch": 2.0188886448495498, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.8267688751220703, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7300853133201599, + "num_tokens": 475701641.0, + "step": 18384 + }, + { + "epoch": 2.0189984625521635, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.5279009342193604, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7065765857696533, + "num_tokens": 475725651.0, + "step": 18385 + }, + { + "epoch": 2.0191082802547773, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.1989822387695312, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.717900276184082, + "num_tokens": 475755963.0, + "step": 18386 + }, + { + "epoch": 2.0192180979573906, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.5491061210632324, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6980938911437988, + "num_tokens": 475782500.0, + "step": 18387 + }, + { + "epoch": 2.0193279156600044, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4323205947875977, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7079482078552246, + "num_tokens": 475810085.0, + "step": 18388 + }, + { + "epoch": 2.019437733362618, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.6795504093170166, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7226629257202148, + "num_tokens": 475831769.0, + "step": 18389 + }, + { + "epoch": 2.019547551065232, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3403608798980713, 
+ "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7040894031524658, + "num_tokens": 475860288.0, + "step": 18390 + }, + { + "epoch": 2.019657368767845, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.60433030128479, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.739657998085022, + "num_tokens": 475881887.0, + "step": 18391 + }, + { + "epoch": 2.019767186470459, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.9789953231811523, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7024080753326416, + "num_tokens": 475907087.0, + "step": 18392 + }, + { + "epoch": 2.0198770041730727, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.595435619354248, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7140513062477112, + "num_tokens": 475930532.0, + "step": 18393 + }, + { + "epoch": 2.0199868218756865, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.429734230041504, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7302490472793579, + "num_tokens": 475956755.0, + "step": 18394 + }, + { + "epoch": 2.0200966395783, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.2555527687072754, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7199803590774536, + "num_tokens": 475988953.0, + "step": 18395 + }, + { + "epoch": 2.0202064572809135, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.7388057708740234, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7319384217262268, + "num_tokens": 476010678.0, + "step": 18396 + }, + { + "epoch": 2.0203162749835273, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4488613605499268, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7249877452850342, + "num_tokens": 476038599.0, + "step": 18397 + }, + { + "epoch": 2.020426092686141, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.389820098876953, + "learning_rate": 1e-06, + "loss": 
0.9918, + "mean_token_accuracy": 0.7026203870773315, + "num_tokens": 476067170.0, + "step": 18398 + }, + { + "epoch": 2.020535910388755, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.4988343715667725, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7224059104919434, + "num_tokens": 476093353.0, + "step": 18399 + }, + { + "epoch": 2.020645728091368, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3373334407806396, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7149882316589355, + "num_tokens": 476120892.0, + "step": 18400 + }, + { + "epoch": 2.020755545793982, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.33345365524292, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7147706747055054, + "num_tokens": 476148830.0, + "step": 18401 + }, + { + "epoch": 2.0208653634965956, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.3120532035827637, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6925104260444641, + "num_tokens": 476176269.0, + "step": 18402 + }, + { + "epoch": 2.0209751811992094, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.495373010635376, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6917136907577515, + "num_tokens": 476203739.0, + "step": 18403 + }, + { + "epoch": 2.021084998901823, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.5237185955047607, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.724203884601593, + "num_tokens": 476227125.0, + "step": 18404 + }, + { + "epoch": 2.0211948166044365, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.431058168411255, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7102543115615845, + "num_tokens": 476253997.0, + "step": 18405 + }, + { + "epoch": 2.02130463430705, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.52335786819458, + "learning_rate": 1e-06, + "loss": 0.7845, + "mean_token_accuracy": 
0.7621879577636719, + "num_tokens": 476276686.0, + "step": 18406 + }, + { + "epoch": 2.021414452009664, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.127967596054077, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7081860303878784, + "num_tokens": 476307945.0, + "step": 18407 + }, + { + "epoch": 2.0215242697122777, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.083512783050537, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.7049145698547363, + "num_tokens": 476344153.0, + "step": 18408 + }, + { + "epoch": 2.0216340874148915, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.307894229888916, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7325677871704102, + "num_tokens": 476372534.0, + "step": 18409 + }, + { + "epoch": 2.021743905117505, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.295720100402832, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.706990659236908, + "num_tokens": 476400143.0, + "step": 18410 + }, + { + "epoch": 2.0218537228201185, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.6162540912628174, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7376857995986938, + "num_tokens": 476421887.0, + "step": 18411 + }, + { + "epoch": 2.0219635405227323, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.389361619949341, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7181724309921265, + "num_tokens": 476448408.0, + "step": 18412 + }, + { + "epoch": 2.022073358225346, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 2.8664743900299072, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.731122612953186, + "num_tokens": 476467945.0, + "step": 18413 + }, + { + "epoch": 2.0221831759279594, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.1273655891418457, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7008682489395142, + "num_tokens": 
476504064.0, + "step": 18414 + }, + { + "epoch": 2.022292993630573, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.486417770385742, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.72551029920578, + "num_tokens": 476528563.0, + "step": 18415 + }, + { + "epoch": 2.022402811333187, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.678626537322998, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7204797863960266, + "num_tokens": 476551278.0, + "step": 18416 + }, + { + "epoch": 2.0225126290358006, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.725743293762207, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7409614324569702, + "num_tokens": 476571059.0, + "step": 18417 + }, + { + "epoch": 2.0226224467384144, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.333024740219116, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7207545042037964, + "num_tokens": 476601332.0, + "step": 18418 + }, + { + "epoch": 2.0227322644410277, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.9730896949768066, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7393227219581604, + "num_tokens": 476620721.0, + "step": 18419 + }, + { + "epoch": 2.0228420821436415, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.545696496963501, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7191031575202942, + "num_tokens": 476647984.0, + "step": 18420 + }, + { + "epoch": 2.0229518998462552, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.3798775672912598, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.721744954586029, + "num_tokens": 476675219.0, + "step": 18421 + }, + { + "epoch": 2.023061717548869, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.210709571838379, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7006043195724487, + "num_tokens": 476707059.0, + "step": 18422 + }, + { + 
"epoch": 2.0231715352514827, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.2540905475616455, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7314903736114502, + "num_tokens": 476734933.0, + "step": 18423 + }, + { + "epoch": 2.023281352954096, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4433932304382324, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7348232269287109, + "num_tokens": 476758364.0, + "step": 18424 + }, + { + "epoch": 2.02339117065671, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.2113044261932373, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.7040705680847168, + "num_tokens": 476788896.0, + "step": 18425 + }, + { + "epoch": 2.0235009883593236, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4623823165893555, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6963779926300049, + "num_tokens": 476814130.0, + "step": 18426 + }, + { + "epoch": 2.0236108060619373, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.364327907562256, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7283551096916199, + "num_tokens": 476839446.0, + "step": 18427 + }, + { + "epoch": 2.0237206237645506, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.2563860416412354, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7003052234649658, + "num_tokens": 476871147.0, + "step": 18428 + }, + { + "epoch": 2.0238304414671644, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.3649239540100098, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.70717453956604, + "num_tokens": 476897450.0, + "step": 18429 + }, + { + "epoch": 2.023940259169778, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.6355199813842773, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7067818641662598, + "num_tokens": 476921078.0, + "step": 18430 + }, + { + "epoch": 2.024050076872392, + 
"ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.5435707569122314, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7165529727935791, + "num_tokens": 476944763.0, + "step": 18431 + }, + { + "epoch": 2.0241598945750057, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.870950222015381, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7303766012191772, + "num_tokens": 476964532.0, + "step": 18432 + }, + { + "epoch": 2.024269712277619, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.6875720024108887, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7284013032913208, + "num_tokens": 476984769.0, + "step": 18433 + }, + { + "epoch": 2.0243795299802327, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4883246421813965, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7077536582946777, + "num_tokens": 477010814.0, + "step": 18434 + }, + { + "epoch": 2.0244893476828465, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.407158851623535, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7185289859771729, + "num_tokens": 477037902.0, + "step": 18435 + }, + { + "epoch": 2.0245991653854603, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.5644822120666504, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7220624685287476, + "num_tokens": 477062536.0, + "step": 18436 + }, + { + "epoch": 2.024708983088074, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.551622152328491, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7239936590194702, + "num_tokens": 477088327.0, + "step": 18437 + }, + { + "epoch": 2.0248188007906873, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.439131736755371, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.743068516254425, + "num_tokens": 477111255.0, + "step": 18438 + }, + { + "epoch": 2.024928618493301, + "ewc_loss": 1.990795135498047e-05, + 
"grad_norm": 2.188551425933838, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7213914394378662, + "num_tokens": 477140742.0, + "step": 18439 + }, + { + "epoch": 2.025038436195915, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.414809226989746, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7094123363494873, + "num_tokens": 477167026.0, + "step": 18440 + }, + { + "epoch": 2.0251482538985286, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.581582546234131, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.714065432548523, + "num_tokens": 477189984.0, + "step": 18441 + }, + { + "epoch": 2.025258071601142, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4855968952178955, + "learning_rate": 1e-06, + "loss": 1.0907, + "mean_token_accuracy": 0.6836353540420532, + "num_tokens": 477216833.0, + "step": 18442 + }, + { + "epoch": 2.0253678893037557, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.6383795738220215, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7110908627510071, + "num_tokens": 477239306.0, + "step": 18443 + }, + { + "epoch": 2.0254777070063694, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.6413235664367676, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7171769142150879, + "num_tokens": 477261024.0, + "step": 18444 + }, + { + "epoch": 2.025587524708983, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.3733508586883545, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7216272950172424, + "num_tokens": 477287270.0, + "step": 18445 + }, + { + "epoch": 2.025697342411597, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.340878486633301, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7186639308929443, + "num_tokens": 477315531.0, + "step": 18446 + }, + { + "epoch": 2.0258071601142102, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.396742820739746, + 
"learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7212334871292114, + "num_tokens": 477340295.0, + "step": 18447 + }, + { + "epoch": 2.025916977816824, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.5595993995666504, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7286338806152344, + "num_tokens": 477362513.0, + "step": 18448 + }, + { + "epoch": 2.0260267955194378, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.2641594409942627, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6941717267036438, + "num_tokens": 477390622.0, + "step": 18449 + }, + { + "epoch": 2.0261366132220515, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.4459173679351807, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.720754861831665, + "num_tokens": 477415906.0, + "step": 18450 + }, + { + "epoch": 2.0262464309246653, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3420751094818115, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7180514335632324, + "num_tokens": 477442984.0, + "step": 18451 + }, + { + "epoch": 2.0263562486272786, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.4062187671661377, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7300556898117065, + "num_tokens": 477467160.0, + "step": 18452 + }, + { + "epoch": 2.0264660663298923, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.210855007171631, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7230188846588135, + "num_tokens": 477494952.0, + "step": 18453 + }, + { + "epoch": 2.026575884032506, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.5179636478424072, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7086834907531738, + "num_tokens": 477523578.0, + "step": 18454 + }, + { + "epoch": 2.02668570173512, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.4698565006256104, + "learning_rate": 1e-06, + "loss": 
0.9434, + "mean_token_accuracy": 0.7241742610931396, + "num_tokens": 477547502.0, + "step": 18455 + }, + { + "epoch": 2.026795519437733, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.762885570526123, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.703303337097168, + "num_tokens": 477568762.0, + "step": 18456 + }, + { + "epoch": 2.026905337140347, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.965613603591919, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7286530137062073, + "num_tokens": 477591095.0, + "step": 18457 + }, + { + "epoch": 2.0270151548429607, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.498253107070923, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7215718030929565, + "num_tokens": 477615159.0, + "step": 18458 + }, + { + "epoch": 2.0271249725455744, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2978549003601074, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6868658065795898, + "num_tokens": 477646241.0, + "step": 18459 + }, + { + "epoch": 2.027234790248188, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.148813247680664, + "learning_rate": 1e-06, + "loss": 1.069, + "mean_token_accuracy": 0.687811017036438, + "num_tokens": 477678701.0, + "step": 18460 + }, + { + "epoch": 2.0273446079508015, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.301287889480591, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6949741840362549, + "num_tokens": 477708276.0, + "step": 18461 + }, + { + "epoch": 2.0274544256534153, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.343088388442993, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7134156227111816, + "num_tokens": 477734701.0, + "step": 18462 + }, + { + "epoch": 2.027564243356029, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.349395275115967, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 
0.7102809548377991, + "num_tokens": 477761108.0, + "step": 18463 + }, + { + "epoch": 2.027674061058643, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3428256511688232, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7335430383682251, + "num_tokens": 477787164.0, + "step": 18464 + }, + { + "epoch": 2.027783878761256, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.600219964981079, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7141838073730469, + "num_tokens": 477811238.0, + "step": 18465 + }, + { + "epoch": 2.02789369646387, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2957077026367188, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7113516926765442, + "num_tokens": 477840378.0, + "step": 18466 + }, + { + "epoch": 2.0280035141664836, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3738815784454346, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.7021903395652771, + "num_tokens": 477866312.0, + "step": 18467 + }, + { + "epoch": 2.0281133318690974, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.471791982650757, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7346587181091309, + "num_tokens": 477889462.0, + "step": 18468 + }, + { + "epoch": 2.028223149571711, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.5094242095947266, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7164345383644104, + "num_tokens": 477912523.0, + "step": 18469 + }, + { + "epoch": 2.0283329672743244, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.5033278465270996, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7295513153076172, + "num_tokens": 477938584.0, + "step": 18470 + }, + { + "epoch": 2.028442784976938, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.337473154067993, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7100176215171814, + "num_tokens": 
477969511.0, + "step": 18471 + }, + { + "epoch": 2.028552602679552, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.5329749584198, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6986231803894043, + "num_tokens": 477996399.0, + "step": 18472 + }, + { + "epoch": 2.0286624203821657, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.6248691082000732, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7244997620582581, + "num_tokens": 478019229.0, + "step": 18473 + }, + { + "epoch": 2.0287722380847795, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.7024855613708496, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7313010692596436, + "num_tokens": 478040209.0, + "step": 18474 + }, + { + "epoch": 2.028882055787393, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.5200438499450684, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7329925298690796, + "num_tokens": 478062881.0, + "step": 18475 + }, + { + "epoch": 2.0289918734900065, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.536562442779541, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7345176935195923, + "num_tokens": 478085248.0, + "step": 18476 + }, + { + "epoch": 2.0291016911926203, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.487560510635376, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7148690819740295, + "num_tokens": 478111876.0, + "step": 18477 + }, + { + "epoch": 2.029211508895234, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.384981870651245, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7181823253631592, + "num_tokens": 478140060.0, + "step": 18478 + }, + { + "epoch": 2.0293213265978474, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.1835854053497314, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6980863809585571, + "num_tokens": 478173027.0, + "step": 18479 + }, + { + 
"epoch": 2.029431144300461, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.493741035461426, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7208618521690369, + "num_tokens": 478197596.0, + "step": 18480 + }, + { + "epoch": 2.029540962003075, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3034536838531494, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.727750837802887, + "num_tokens": 478227789.0, + "step": 18481 + }, + { + "epoch": 2.0296507797056886, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2572572231292725, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7068452835083008, + "num_tokens": 478259865.0, + "step": 18482 + }, + { + "epoch": 2.0297605974083024, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.790358543395996, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7274808287620544, + "num_tokens": 478280288.0, + "step": 18483 + }, + { + "epoch": 2.0298704151109157, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.8726680278778076, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7167091369628906, + "num_tokens": 478301159.0, + "step": 18484 + }, + { + "epoch": 2.0299802328135295, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.530083179473877, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7223876714706421, + "num_tokens": 478326072.0, + "step": 18485 + }, + { + "epoch": 2.030090050516143, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3132224082946777, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6773547530174255, + "num_tokens": 478362978.0, + "step": 18486 + }, + { + "epoch": 2.030199868218757, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.160562515258789, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7179616689682007, + "num_tokens": 478395259.0, + "step": 18487 + }, + { + "epoch": 2.0303096859213707, + 
"ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2757909297943115, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.707281231880188, + "num_tokens": 478424449.0, + "step": 18488 + }, + { + "epoch": 2.030419503623984, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.21889066696167, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7013309001922607, + "num_tokens": 478454349.0, + "step": 18489 + }, + { + "epoch": 2.030529321326598, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.768218755722046, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.7059524059295654, + "num_tokens": 478476519.0, + "step": 18490 + }, + { + "epoch": 2.0306391390292116, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.5117475986480713, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7120636701583862, + "num_tokens": 478501584.0, + "step": 18491 + }, + { + "epoch": 2.0307489567318253, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.449949026107788, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.7123045325279236, + "num_tokens": 478527028.0, + "step": 18492 + }, + { + "epoch": 2.0308587744344386, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.230029344558716, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7140488028526306, + "num_tokens": 478556497.0, + "step": 18493 + }, + { + "epoch": 2.0309685921370524, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.433368682861328, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7179169654846191, + "num_tokens": 478581600.0, + "step": 18494 + }, + { + "epoch": 2.031078409839666, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.471248149871826, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7033827304840088, + "num_tokens": 478608224.0, + "step": 18495 + }, + { + "epoch": 2.03118822754228, + "ewc_loss": 2.002716064453125e-05, + 
"grad_norm": 2.661614418029785, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7302846908569336, + "num_tokens": 478630025.0, + "step": 18496 + }, + { + "epoch": 2.0312980452448937, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.484532117843628, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7109944820404053, + "num_tokens": 478655944.0, + "step": 18497 + }, + { + "epoch": 2.031407862947507, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3950870037078857, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7060185670852661, + "num_tokens": 478682722.0, + "step": 18498 + }, + { + "epoch": 2.0315176806501207, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2970645427703857, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7096743583679199, + "num_tokens": 478712650.0, + "step": 18499 + }, + { + "epoch": 2.0316274983527345, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.9478355646133423, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7032123804092407, + "num_tokens": 478750751.0, + "step": 18500 + }, + { + "epoch": 2.0317373160553482, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2973599433898926, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6906454563140869, + "num_tokens": 478782213.0, + "step": 18501 + }, + { + "epoch": 2.031847133757962, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 32.354610443115234, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7096543908119202, + "num_tokens": 478806595.0, + "step": 18502 + }, + { + "epoch": 2.0319569514605753, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2093944549560547, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6917537450790405, + "num_tokens": 478842045.0, + "step": 18503 + }, + { + "epoch": 2.032066769163189, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.392716884613037, + 
"learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7119568586349487, + "num_tokens": 478870674.0, + "step": 18504 + }, + { + "epoch": 2.032176586865803, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.7125916481018066, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7190842032432556, + "num_tokens": 478891731.0, + "step": 18505 + }, + { + "epoch": 2.0322864045684166, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.4530792236328125, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7109102010726929, + "num_tokens": 478920241.0, + "step": 18506 + }, + { + "epoch": 2.03239622227103, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.258362293243408, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6944432854652405, + "num_tokens": 478951766.0, + "step": 18507 + }, + { + "epoch": 2.0325060399736437, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2220494747161865, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7060492634773254, + "num_tokens": 478984028.0, + "step": 18508 + }, + { + "epoch": 2.0326158576762574, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3132522106170654, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.7012918591499329, + "num_tokens": 479013463.0, + "step": 18509 + }, + { + "epoch": 2.032725675378871, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.509763717651367, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6940408945083618, + "num_tokens": 479038882.0, + "step": 18510 + }, + { + "epoch": 2.032835493081485, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.5385403633117676, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7052832841873169, + "num_tokens": 479063048.0, + "step": 18511 + }, + { + "epoch": 2.0329453107840982, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2810628414154053, + "learning_rate": 1e-06, + "loss": 
1.0032, + "mean_token_accuracy": 0.7017983198165894, + "num_tokens": 479092793.0, + "step": 18512 + }, + { + "epoch": 2.033055128486712, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.642519235610962, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7260013818740845, + "num_tokens": 479114094.0, + "step": 18513 + }, + { + "epoch": 2.0331649461893258, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.4745914936065674, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7316629886627197, + "num_tokens": 479139288.0, + "step": 18514 + }, + { + "epoch": 2.0332747638919395, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.581270694732666, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7207791209220886, + "num_tokens": 479161789.0, + "step": 18515 + }, + { + "epoch": 2.033384581594553, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3831098079681396, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7089442610740662, + "num_tokens": 479187593.0, + "step": 18516 + }, + { + "epoch": 2.0334943992971666, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3159847259521484, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7184575796127319, + "num_tokens": 479215416.0, + "step": 18517 + }, + { + "epoch": 2.0336042169997803, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.283527374267578, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7139026522636414, + "num_tokens": 479242278.0, + "step": 18518 + }, + { + "epoch": 2.033714034702394, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2538950443267822, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7034217119216919, + "num_tokens": 479270308.0, + "step": 18519 + }, + { + "epoch": 2.033823852405008, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.346191644668579, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 
0.7291545867919922, + "num_tokens": 479297337.0, + "step": 18520 + }, + { + "epoch": 2.033933670107621, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.325364828109741, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7041972279548645, + "num_tokens": 479326319.0, + "step": 18521 + }, + { + "epoch": 2.034043487810235, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.354897975921631, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7181558012962341, + "num_tokens": 479353862.0, + "step": 18522 + }, + { + "epoch": 2.0341533055128487, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.346937656402588, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7148278951644897, + "num_tokens": 479380234.0, + "step": 18523 + }, + { + "epoch": 2.0342631232154624, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.6701221466064453, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7266206741333008, + "num_tokens": 479400222.0, + "step": 18524 + }, + { + "epoch": 2.034372940918076, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.4927902221679688, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7186500430107117, + "num_tokens": 479425746.0, + "step": 18525 + }, + { + "epoch": 2.0344827586206895, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.643763542175293, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7260251045227051, + "num_tokens": 479449819.0, + "step": 18526 + }, + { + "epoch": 2.0345925763233033, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.4523262977600098, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7095187902450562, + "num_tokens": 479475323.0, + "step": 18527 + }, + { + "epoch": 2.034702394025917, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2660510540008545, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6981439590454102, + "num_tokens": 
479505738.0, + "step": 18528 + }, + { + "epoch": 2.0348122117285308, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.669490098953247, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7221895456314087, + "num_tokens": 479527435.0, + "step": 18529 + }, + { + "epoch": 2.034922029431144, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.919992446899414, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7478249669075012, + "num_tokens": 479545792.0, + "step": 18530 + }, + { + "epoch": 2.035031847133758, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.3976242542266846, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.725092887878418, + "num_tokens": 479570974.0, + "step": 18531 + }, + { + "epoch": 2.0351416648363716, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.8988723754882812, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7479584217071533, + "num_tokens": 479588853.0, + "step": 18532 + }, + { + "epoch": 2.0352514825389854, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.810380458831787, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7252176403999329, + "num_tokens": 479609539.0, + "step": 18533 + }, + { + "epoch": 2.035361300241599, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4549062252044678, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7269313335418701, + "num_tokens": 479635985.0, + "step": 18534 + }, + { + "epoch": 2.0354711179442124, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4006905555725098, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7116901874542236, + "num_tokens": 479662486.0, + "step": 18535 + }, + { + "epoch": 2.035580935646826, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.554835557937622, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7125548124313354, + "num_tokens": 479686596.0, + "step": 18536 + }, + { + 
"epoch": 2.03569075334944, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.1710922718048096, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7256401777267456, + "num_tokens": 479718032.0, + "step": 18537 + }, + { + "epoch": 2.0358005710520537, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.600670099258423, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7159992456436157, + "num_tokens": 479743117.0, + "step": 18538 + }, + { + "epoch": 2.0359103887546675, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.2969038486480713, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7372632026672363, + "num_tokens": 479772035.0, + "step": 18539 + }, + { + "epoch": 2.0360202064572808, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 2.475825071334839, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.718124508857727, + "num_tokens": 479797667.0, + "step": 18540 + }, + { + "epoch": 2.0361300241598945, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3222501277923584, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6891798973083496, + "num_tokens": 479825969.0, + "step": 18541 + }, + { + "epoch": 2.0362398418625083, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5428764820098877, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7169311046600342, + "num_tokens": 479851139.0, + "step": 18542 + }, + { + "epoch": 2.036349659565122, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4705960750579834, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7078760266304016, + "num_tokens": 479876346.0, + "step": 18543 + }, + { + "epoch": 2.0364594772677354, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 7.225064277648926, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.717572808265686, + "num_tokens": 479897002.0, + "step": 18544 + }, + { + "epoch": 2.036569294970349, + 
"ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.673640489578247, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7174860239028931, + "num_tokens": 479919893.0, + "step": 18545 + }, + { + "epoch": 2.036679112672963, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.37567400932312, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.715207576751709, + "num_tokens": 479949082.0, + "step": 18546 + }, + { + "epoch": 2.0367889303755766, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.7620558738708496, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7126457691192627, + "num_tokens": 479972061.0, + "step": 18547 + }, + { + "epoch": 2.0368987480781904, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.2652978897094727, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7435977458953857, + "num_tokens": 480000751.0, + "step": 18548 + }, + { + "epoch": 2.0370085657808037, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.249469757080078, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7131906747817993, + "num_tokens": 480029222.0, + "step": 18549 + }, + { + "epoch": 2.0371183834834174, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.599804162979126, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7358933687210083, + "num_tokens": 480053003.0, + "step": 18550 + }, + { + "epoch": 2.037228201186031, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.2987051010131836, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7219312787055969, + "num_tokens": 480080069.0, + "step": 18551 + }, + { + "epoch": 2.037338018888645, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5325405597686768, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7178902626037598, + "num_tokens": 480103242.0, + "step": 18552 + }, + { + "epoch": 2.0374478365912587, + "ewc_loss": 2.014636993408203e-05, + 
"grad_norm": 2.547640562057495, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7142693400382996, + "num_tokens": 480128365.0, + "step": 18553 + }, + { + "epoch": 2.037557654293872, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.6321065425872803, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7347844839096069, + "num_tokens": 480150859.0, + "step": 18554 + }, + { + "epoch": 2.037667471996486, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.472505569458008, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7207320332527161, + "num_tokens": 480176966.0, + "step": 18555 + }, + { + "epoch": 2.0377772896990995, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.8631951808929443, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7283071875572205, + "num_tokens": 480196250.0, + "step": 18556 + }, + { + "epoch": 2.0378871074017133, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5887720584869385, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7356401681900024, + "num_tokens": 480218428.0, + "step": 18557 + }, + { + "epoch": 2.0379969251043266, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.273373603820801, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7312294840812683, + "num_tokens": 480247440.0, + "step": 18558 + }, + { + "epoch": 2.0381067428069404, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.8302392959594727, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7402256727218628, + "num_tokens": 480270897.0, + "step": 18559 + }, + { + "epoch": 2.038216560509554, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.311372756958008, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7320750951766968, + "num_tokens": 480299102.0, + "step": 18560 + }, + { + "epoch": 2.038326378212168, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.7451114654541016, + 
"learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7055895328521729, + "num_tokens": 480322035.0, + "step": 18561 + }, + { + "epoch": 2.0384361959147816, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.438822031021118, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7299240827560425, + "num_tokens": 480346328.0, + "step": 18562 + }, + { + "epoch": 2.038546013617395, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5334537029266357, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7232743501663208, + "num_tokens": 480370514.0, + "step": 18563 + }, + { + "epoch": 2.0386558313200087, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.6301305294036865, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7300516366958618, + "num_tokens": 480391909.0, + "step": 18564 + }, + { + "epoch": 2.0387656490226225, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.611741781234741, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7254095077514648, + "num_tokens": 480415314.0, + "step": 18565 + }, + { + "epoch": 2.0388754667252362, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.286794424057007, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.705391526222229, + "num_tokens": 480444693.0, + "step": 18566 + }, + { + "epoch": 2.03898528442785, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3577444553375244, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7107356190681458, + "num_tokens": 480472164.0, + "step": 18567 + }, + { + "epoch": 2.0390951021304633, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 6.876319408416748, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7435847520828247, + "num_tokens": 480504905.0, + "step": 18568 + }, + { + "epoch": 2.039204919833077, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.578577995300293, + "learning_rate": 1e-06, + "loss": 0.9786, 
+ "mean_token_accuracy": 0.7106403708457947, + "num_tokens": 480530076.0, + "step": 18569 + }, + { + "epoch": 2.039314737535691, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.242783784866333, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7081039547920227, + "num_tokens": 480560966.0, + "step": 18570 + }, + { + "epoch": 2.0394245552383046, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5017757415771484, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6970770955085754, + "num_tokens": 480586988.0, + "step": 18571 + }, + { + "epoch": 2.039534372940918, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.357570171356201, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6918103694915771, + "num_tokens": 480615259.0, + "step": 18572 + }, + { + "epoch": 2.0396441906435316, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.156320571899414, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7035171985626221, + "num_tokens": 480646526.0, + "step": 18573 + }, + { + "epoch": 2.0397540083461454, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.7796082496643066, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7128562927246094, + "num_tokens": 480667027.0, + "step": 18574 + }, + { + "epoch": 2.039863826048759, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5012593269348145, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7201783657073975, + "num_tokens": 480690476.0, + "step": 18575 + }, + { + "epoch": 2.039973643751373, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3315205574035645, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6963130235671997, + "num_tokens": 480719978.0, + "step": 18576 + }, + { + "epoch": 2.0400834614539862, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.2621874809265137, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 
0.7258222103118896, + "num_tokens": 480749163.0, + "step": 18577 + }, + { + "epoch": 2.0401932791566, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.718203067779541, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7349377870559692, + "num_tokens": 480769114.0, + "step": 18578 + }, + { + "epoch": 2.0403030968592137, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4646570682525635, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7486258745193481, + "num_tokens": 480792241.0, + "step": 18579 + }, + { + "epoch": 2.0404129145618275, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.488825798034668, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6987898945808411, + "num_tokens": 480817932.0, + "step": 18580 + }, + { + "epoch": 2.0405227322644413, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.6237051486968994, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7254930138587952, + "num_tokens": 480839867.0, + "step": 18581 + }, + { + "epoch": 2.0406325499670546, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4589366912841797, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7326485514640808, + "num_tokens": 480863797.0, + "step": 18582 + }, + { + "epoch": 2.0407423676696683, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4887101650238037, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7240256667137146, + "num_tokens": 480894772.0, + "step": 18583 + }, + { + "epoch": 2.040852185372282, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.7223987579345703, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7203937768936157, + "num_tokens": 480917888.0, + "step": 18584 + }, + { + "epoch": 2.040962003074896, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.760204315185547, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7410359382629395, + "num_tokens": 
480937887.0, + "step": 18585 + }, + { + "epoch": 2.041071820777509, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.691401243209839, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7331037521362305, + "num_tokens": 480959462.0, + "step": 18586 + }, + { + "epoch": 2.041181638480123, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.290492296218872, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7028324604034424, + "num_tokens": 480987958.0, + "step": 18587 + }, + { + "epoch": 2.0412914561827367, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.7553529739379883, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7043371200561523, + "num_tokens": 481011290.0, + "step": 18588 + }, + { + "epoch": 2.0414012738853504, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 8.637845039367676, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7279344797134399, + "num_tokens": 481036578.0, + "step": 18589 + }, + { + "epoch": 2.041511091587964, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4366343021392822, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7035179138183594, + "num_tokens": 481065866.0, + "step": 18590 + }, + { + "epoch": 2.0416209092905775, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.51297664642334, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6867645978927612, + "num_tokens": 481092331.0, + "step": 18591 + }, + { + "epoch": 2.0417307269931912, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5890731811523438, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7104162573814392, + "num_tokens": 481116641.0, + "step": 18592 + }, + { + "epoch": 2.041840544695805, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.2114555835723877, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7035472393035889, + "num_tokens": 481147880.0, + "step": 18593 + }, + { + 
"epoch": 2.0419503623984188, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.1979382038116455, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7162421941757202, + "num_tokens": 481177967.0, + "step": 18594 + }, + { + "epoch": 2.042060180101032, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.7129669189453125, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7204478979110718, + "num_tokens": 481199113.0, + "step": 18595 + }, + { + "epoch": 2.042169997803646, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3291196823120117, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.717897355556488, + "num_tokens": 481224513.0, + "step": 18596 + }, + { + "epoch": 2.0422798155062596, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.648528814315796, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7204371094703674, + "num_tokens": 481247538.0, + "step": 18597 + }, + { + "epoch": 2.0423896332088733, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.7535762786865234, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7254213690757751, + "num_tokens": 481268354.0, + "step": 18598 + }, + { + "epoch": 2.042499450911487, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5243113040924072, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6991273760795593, + "num_tokens": 481292886.0, + "step": 18599 + }, + { + "epoch": 2.0426092686141004, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4703261852264404, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7165547013282776, + "num_tokens": 481318875.0, + "step": 18600 + }, + { + "epoch": 2.042719086316714, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.1563355922698975, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7063750624656677, + "num_tokens": 481350672.0, + "step": 18601 + }, + { + "epoch": 2.042828904019328, + 
"ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.8830349445343018, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6958423852920532, + "num_tokens": 481373624.0, + "step": 18602 + }, + { + "epoch": 2.0429387217219417, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.8174753189086914, + "learning_rate": 1e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.753320574760437, + "num_tokens": 481392991.0, + "step": 18603 + }, + { + "epoch": 2.0430485394245554, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 7.050581455230713, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7178903818130493, + "num_tokens": 481413218.0, + "step": 18604 + }, + { + "epoch": 2.0431583571271688, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.37109375, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6993085145950317, + "num_tokens": 481441656.0, + "step": 18605 + }, + { + "epoch": 2.0432681748297825, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 3.0286483764648438, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7266483306884766, + "num_tokens": 481459766.0, + "step": 18606 + }, + { + "epoch": 2.0433779925323963, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.268864631652832, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7216618061065674, + "num_tokens": 481486822.0, + "step": 18607 + }, + { + "epoch": 2.04348781023501, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.69162917137146, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7310971617698669, + "num_tokens": 481508810.0, + "step": 18608 + }, + { + "epoch": 2.0435976279376233, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3492324352264404, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7062618732452393, + "num_tokens": 481535761.0, + "step": 18609 + }, + { + "epoch": 2.043707445640237, + "ewc_loss": 2.014636993408203e-05, + 
"grad_norm": 2.0718119144439697, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7437429428100586, + "num_tokens": 481567869.0, + "step": 18610 + }, + { + "epoch": 2.043817263342851, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.2960262298583984, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6994686126708984, + "num_tokens": 481596756.0, + "step": 18611 + }, + { + "epoch": 2.0439270810454646, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3739876747131348, + "learning_rate": 1e-06, + "loss": 1.0844, + "mean_token_accuracy": 0.6838321685791016, + "num_tokens": 481625764.0, + "step": 18612 + }, + { + "epoch": 2.0440368987480784, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3669426441192627, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7050805687904358, + "num_tokens": 481654419.0, + "step": 18613 + }, + { + "epoch": 2.0441467164506917, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.70660400390625, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7285513877868652, + "num_tokens": 481673690.0, + "step": 18614 + }, + { + "epoch": 2.0442565341533054, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.395322322845459, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.723320722579956, + "num_tokens": 481700537.0, + "step": 18615 + }, + { + "epoch": 2.044366351855919, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.648205518722534, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.732516348361969, + "num_tokens": 481721716.0, + "step": 18616 + }, + { + "epoch": 2.044476169558533, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.590367555618286, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7195144891738892, + "num_tokens": 481744694.0, + "step": 18617 + }, + { + "epoch": 2.0445859872611467, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3675646781921387, + 
"learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7175817489624023, + "num_tokens": 481773619.0, + "step": 18618 + }, + { + "epoch": 2.04469580496376, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6767566204071045, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7270563840866089, + "num_tokens": 481796860.0, + "step": 18619 + }, + { + "epoch": 2.044805622666374, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3374321460723877, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7158529758453369, + "num_tokens": 481824971.0, + "step": 18620 + }, + { + "epoch": 2.0449154403689875, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4539146423339844, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7208179235458374, + "num_tokens": 481851380.0, + "step": 18621 + }, + { + "epoch": 2.0450252580716013, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5480902194976807, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.730076789855957, + "num_tokens": 481875181.0, + "step": 18622 + }, + { + "epoch": 2.0451350757742146, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4865095615386963, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6951369643211365, + "num_tokens": 481901916.0, + "step": 18623 + }, + { + "epoch": 2.0452448934768284, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.375627040863037, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7067120671272278, + "num_tokens": 481929392.0, + "step": 18624 + }, + { + "epoch": 2.045354711179442, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4858999252319336, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7153175473213196, + "num_tokens": 481955024.0, + "step": 18625 + }, + { + "epoch": 2.045464528882056, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.408482313156128, + "learning_rate": 1e-06, + "loss": 
0.9617, + "mean_token_accuracy": 0.7213947176933289, + "num_tokens": 481982868.0, + "step": 18626 + }, + { + "epoch": 2.0455743465846696, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3754544258117676, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.703811764717102, + "num_tokens": 482010241.0, + "step": 18627 + }, + { + "epoch": 2.045684164287283, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.352938413619995, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6983916163444519, + "num_tokens": 482038554.0, + "step": 18628 + }, + { + "epoch": 2.0457939819898967, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.147581100463867, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7039607763290405, + "num_tokens": 482071571.0, + "step": 18629 + }, + { + "epoch": 2.0459037996925105, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5774052143096924, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6949780583381653, + "num_tokens": 482094918.0, + "step": 18630 + }, + { + "epoch": 2.046013617395124, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.939467668533325, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7045741081237793, + "num_tokens": 482113080.0, + "step": 18631 + }, + { + "epoch": 2.046123435097738, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.327450752258301, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7289283275604248, + "num_tokens": 482139261.0, + "step": 18632 + }, + { + "epoch": 2.0462332528003513, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.834104299545288, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7240587472915649, + "num_tokens": 482159933.0, + "step": 18633 + }, + { + "epoch": 2.046343070502965, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.612274646759033, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 
0.7108787298202515, + "num_tokens": 482184511.0, + "step": 18634 + }, + { + "epoch": 2.046452888205579, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3250555992126465, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.710767924785614, + "num_tokens": 482214353.0, + "step": 18635 + }, + { + "epoch": 2.0465627059081926, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.510153293609619, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7407317161560059, + "num_tokens": 482238019.0, + "step": 18636 + }, + { + "epoch": 2.046672523610806, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6020848751068115, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7144445180892944, + "num_tokens": 482263259.0, + "step": 18637 + }, + { + "epoch": 2.0467823413134196, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.1537466049194336, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7160075306892395, + "num_tokens": 482297651.0, + "step": 18638 + }, + { + "epoch": 2.0468921590160334, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3630480766296387, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6832995414733887, + "num_tokens": 482326127.0, + "step": 18639 + }, + { + "epoch": 2.047001976718647, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.2520694732666016, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7150844931602478, + "num_tokens": 482355648.0, + "step": 18640 + }, + { + "epoch": 2.047111794421261, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 3.645913600921631, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7234677672386169, + "num_tokens": 482393914.0, + "step": 18641 + }, + { + "epoch": 2.047221612123874, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4420042037963867, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7048813700675964, + 
"num_tokens": 482419669.0, + "step": 18642 + }, + { + "epoch": 2.047331429826488, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6706392765045166, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7153015732765198, + "num_tokens": 482441513.0, + "step": 18643 + }, + { + "epoch": 2.0474412475291017, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.494317054748535, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7029589414596558, + "num_tokens": 482467869.0, + "step": 18644 + }, + { + "epoch": 2.0475510652317155, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3265771865844727, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7233759164810181, + "num_tokens": 482496206.0, + "step": 18645 + }, + { + "epoch": 2.047660882934329, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4599177837371826, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.702791690826416, + "num_tokens": 482521093.0, + "step": 18646 + }, + { + "epoch": 2.0477707006369426, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.510768413543701, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7185072302818298, + "num_tokens": 482545337.0, + "step": 18647 + }, + { + "epoch": 2.0478805183395563, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.268747329711914, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7437992095947266, + "num_tokens": 482572245.0, + "step": 18648 + }, + { + "epoch": 2.04799033604217, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.2859480381011963, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.726614236831665, + "num_tokens": 482600882.0, + "step": 18649 + }, + { + "epoch": 2.048100153744784, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.453490972518921, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.727613091468811, + "num_tokens": 482626059.0, + "step": 
18650 + }, + { + "epoch": 2.048209971447397, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 3.800269365310669, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6943470239639282, + "num_tokens": 482654852.0, + "step": 18651 + }, + { + "epoch": 2.048319789150011, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.4405722618103027, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7337340712547302, + "num_tokens": 482681155.0, + "step": 18652 + }, + { + "epoch": 2.0484296068526247, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.8969733715057373, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7334538698196411, + "num_tokens": 482699738.0, + "step": 18653 + }, + { + "epoch": 2.0485394245552384, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.34499454498291, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7063392996788025, + "num_tokens": 482729568.0, + "step": 18654 + }, + { + "epoch": 2.048649242257852, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5792453289031982, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7303398847579956, + "num_tokens": 482752127.0, + "step": 18655 + }, + { + "epoch": 2.0487590599604655, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.53922963142395, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7211122512817383, + "num_tokens": 482775429.0, + "step": 18656 + }, + { + "epoch": 2.0488688776630792, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.2577273845672607, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7013278007507324, + "num_tokens": 482803518.0, + "step": 18657 + }, + { + "epoch": 2.048978695365693, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7739107608795166, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7102653980255127, + "num_tokens": 482829798.0, + "step": 18658 + }, + { + "epoch": 
2.0490885130683067, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3484416007995605, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7206243276596069, + "num_tokens": 482858683.0, + "step": 18659 + }, + { + "epoch": 2.04919833077092, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.352781295776367, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7085287570953369, + "num_tokens": 482884621.0, + "step": 18660 + }, + { + "epoch": 2.049308148473534, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.5242011547088623, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7184451818466187, + "num_tokens": 482910212.0, + "step": 18661 + }, + { + "epoch": 2.0494179661761476, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.243178606033325, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7308710217475891, + "num_tokens": 482938617.0, + "step": 18662 + }, + { + "epoch": 2.0495277838787613, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.2878496646881104, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7182782292366028, + "num_tokens": 482968113.0, + "step": 18663 + }, + { + "epoch": 2.049637601581375, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.365554094314575, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6959157586097717, + "num_tokens": 482998324.0, + "step": 18664 + }, + { + "epoch": 2.0497474192839884, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3977317810058594, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7017958164215088, + "num_tokens": 483025136.0, + "step": 18665 + }, + { + "epoch": 2.049857236986602, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.566366672515869, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7241367101669312, + "num_tokens": 483048026.0, + "step": 18666 + }, + { + "epoch": 2.049967054689216, + "ewc_loss": 
2.014636993408203e-05, + "grad_norm": 2.722285509109497, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7396660447120667, + "num_tokens": 483069609.0, + "step": 18667 + }, + { + "epoch": 2.0500768723918297, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.559922695159912, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7240691184997559, + "num_tokens": 483092995.0, + "step": 18668 + }, + { + "epoch": 2.0501866900944434, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 2.3318605422973633, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.7116408348083496, + "num_tokens": 483119666.0, + "step": 18669 + }, + { + "epoch": 2.0502965077970567, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.1861255168914795, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7482961416244507, + "num_tokens": 483150595.0, + "step": 18670 + }, + { + "epoch": 2.0504063254996705, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5264973640441895, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7279338836669922, + "num_tokens": 483173866.0, + "step": 18671 + }, + { + "epoch": 2.0505161432022843, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.841431140899658, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7071710824966431, + "num_tokens": 483195512.0, + "step": 18672 + }, + { + "epoch": 2.050625960904898, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.264139175415039, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7350925207138062, + "num_tokens": 483224278.0, + "step": 18673 + }, + { + "epoch": 2.0507357786075113, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3741042613983154, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7215331196784973, + "num_tokens": 483251606.0, + "step": 18674 + }, + { + "epoch": 2.050845596310125, + "ewc_loss": 2.0265579223632812e-05, + 
"grad_norm": 2.606640338897705, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.710658073425293, + "num_tokens": 483276901.0, + "step": 18675 + }, + { + "epoch": 2.050955414012739, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3735077381134033, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7286181449890137, + "num_tokens": 483304570.0, + "step": 18676 + }, + { + "epoch": 2.0510652317153526, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3532962799072266, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.741328775882721, + "num_tokens": 483330328.0, + "step": 18677 + }, + { + "epoch": 2.0511750494179664, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4855830669403076, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6903591156005859, + "num_tokens": 483356872.0, + "step": 18678 + }, + { + "epoch": 2.0512848671205797, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.519007444381714, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.7055033445358276, + "num_tokens": 483382817.0, + "step": 18679 + }, + { + "epoch": 2.0513946848231934, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.2574524879455566, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7388884425163269, + "num_tokens": 483411726.0, + "step": 18680 + }, + { + "epoch": 2.051504502525807, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.313321828842163, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7181597948074341, + "num_tokens": 483440186.0, + "step": 18681 + }, + { + "epoch": 2.051614320228421, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.575366973876953, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7242673635482788, + "num_tokens": 483463864.0, + "step": 18682 + }, + { + "epoch": 2.0517241379310347, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.491532802581787, 
+ "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7162556052207947, + "num_tokens": 483488716.0, + "step": 18683 + }, + { + "epoch": 2.051833955633648, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3509957790374756, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7015825510025024, + "num_tokens": 483516897.0, + "step": 18684 + }, + { + "epoch": 2.0519437733362618, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7260892391204834, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7231272459030151, + "num_tokens": 483539481.0, + "step": 18685 + }, + { + "epoch": 2.0520535910388755, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.573448896408081, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7368381023406982, + "num_tokens": 483562160.0, + "step": 18686 + }, + { + "epoch": 2.0521634087414893, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.597407579421997, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.707697331905365, + "num_tokens": 483587201.0, + "step": 18687 + }, + { + "epoch": 2.0522732264441026, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.635584592819214, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7195655703544617, + "num_tokens": 483610822.0, + "step": 18688 + }, + { + "epoch": 2.0523830441467164, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.677093029022217, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7112469673156738, + "num_tokens": 483632577.0, + "step": 18689 + }, + { + "epoch": 2.05249286184933, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.429400682449341, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7065246105194092, + "num_tokens": 483659643.0, + "step": 18690 + }, + { + "epoch": 2.052602679551944, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.56103253364563, + "learning_rate": 1e-06, + "loss": 
0.9577, + "mean_token_accuracy": 0.7163551449775696, + "num_tokens": 483685053.0, + "step": 18691 + }, + { + "epoch": 2.0527124972545576, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7760660648345947, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7129194736480713, + "num_tokens": 483708837.0, + "step": 18692 + }, + { + "epoch": 2.052822314957171, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5762383937835693, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7189306020736694, + "num_tokens": 483732545.0, + "step": 18693 + }, + { + "epoch": 2.0529321326597847, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6079213619232178, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7315773963928223, + "num_tokens": 483755112.0, + "step": 18694 + }, + { + "epoch": 2.0530419503623984, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6743011474609375, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.723832905292511, + "num_tokens": 483777305.0, + "step": 18695 + }, + { + "epoch": 2.053151768065012, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6137819290161133, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.742294192314148, + "num_tokens": 483800500.0, + "step": 18696 + }, + { + "epoch": 2.0532615857676255, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4871878623962402, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7187016606330872, + "num_tokens": 483826162.0, + "step": 18697 + }, + { + "epoch": 2.0533714034702393, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.216658353805542, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7199921607971191, + "num_tokens": 483855322.0, + "step": 18698 + }, + { + "epoch": 2.053481221172853, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7541913986206055, + "learning_rate": 1e-06, + "loss": 0.9482, + 
"mean_token_accuracy": 0.7236790657043457, + "num_tokens": 483877944.0, + "step": 18699 + }, + { + "epoch": 2.053591038875467, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.389296531677246, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7194235324859619, + "num_tokens": 483904372.0, + "step": 18700 + }, + { + "epoch": 2.0537008565780805, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4935481548309326, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6908326148986816, + "num_tokens": 483933983.0, + "step": 18701 + }, + { + "epoch": 2.053810674280694, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5275208950042725, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7086765766143799, + "num_tokens": 483960201.0, + "step": 18702 + }, + { + "epoch": 2.0539204919833076, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.270116090774536, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7019822597503662, + "num_tokens": 483990681.0, + "step": 18703 + }, + { + "epoch": 2.0540303096859214, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.34859037399292, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7189965844154358, + "num_tokens": 484019494.0, + "step": 18704 + }, + { + "epoch": 2.054140127388535, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5369980335235596, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7150281667709351, + "num_tokens": 484044720.0, + "step": 18705 + }, + { + "epoch": 2.054249945091149, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4042558670043945, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7187596559524536, + "num_tokens": 484071022.0, + "step": 18706 + }, + { + "epoch": 2.054359762793762, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7776801586151123, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 
0.7440488934516907, + "num_tokens": 484089798.0, + "step": 18707 + }, + { + "epoch": 2.054469580496376, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5244174003601074, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7229658365249634, + "num_tokens": 484115280.0, + "step": 18708 + }, + { + "epoch": 2.0545793981989897, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.603994846343994, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7094014883041382, + "num_tokens": 484139932.0, + "step": 18709 + }, + { + "epoch": 2.0546892159016035, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.198011636734009, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.737743616104126, + "num_tokens": 484170720.0, + "step": 18710 + }, + { + "epoch": 2.054799033604217, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.2832608222961426, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.70970219373703, + "num_tokens": 484199463.0, + "step": 18711 + }, + { + "epoch": 2.0549088513068305, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6463332176208496, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7283903360366821, + "num_tokens": 484223309.0, + "step": 18712 + }, + { + "epoch": 2.0550186690094443, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4295754432678223, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6979783773422241, + "num_tokens": 484252023.0, + "step": 18713 + }, + { + "epoch": 2.055128486712058, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.629845380783081, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.718920111656189, + "num_tokens": 484276029.0, + "step": 18714 + }, + { + "epoch": 2.055238304414672, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4124081134796143, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6978427171707153, + "num_tokens": 
484305587.0, + "step": 18715 + }, + { + "epoch": 2.055348122117285, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.9583284854888916, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7412062883377075, + "num_tokens": 484324145.0, + "step": 18716 + }, + { + "epoch": 2.055457939819899, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.228191614151001, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7106272578239441, + "num_tokens": 484356125.0, + "step": 18717 + }, + { + "epoch": 2.0555677575225126, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3848299980163574, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7130330801010132, + "num_tokens": 484385600.0, + "step": 18718 + }, + { + "epoch": 2.0556775752251264, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.638411283493042, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7403346300125122, + "num_tokens": 484410154.0, + "step": 18719 + }, + { + "epoch": 2.05578739292774, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5737054347991943, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7328952550888062, + "num_tokens": 484433103.0, + "step": 18720 + }, + { + "epoch": 2.0558972106303535, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.2843270301818848, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.714786171913147, + "num_tokens": 484461954.0, + "step": 18721 + }, + { + "epoch": 2.0560070283329672, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4900474548339844, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7451313138008118, + "num_tokens": 484484988.0, + "step": 18722 + }, + { + "epoch": 2.056116846035581, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3328850269317627, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.7105100154876709, + "num_tokens": 484516484.0, + "step": 18723 + }, 
+ { + "epoch": 2.0562266637381947, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.566340208053589, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7102503180503845, + "num_tokens": 484539532.0, + "step": 18724 + }, + { + "epoch": 2.056336481440808, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.664639949798584, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7128841876983643, + "num_tokens": 484564198.0, + "step": 18725 + }, + { + "epoch": 2.056446299143422, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4135630130767822, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7091322541236877, + "num_tokens": 484590622.0, + "step": 18726 + }, + { + "epoch": 2.0565561168460356, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3518872261047363, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6852473616600037, + "num_tokens": 484619910.0, + "step": 18727 + }, + { + "epoch": 2.0566659345486493, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.8784968852996826, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.722465991973877, + "num_tokens": 484640093.0, + "step": 18728 + }, + { + "epoch": 2.056775752251263, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4296741485595703, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.706524133682251, + "num_tokens": 484669591.0, + "step": 18729 + }, + { + "epoch": 2.0568855699538764, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.610093832015991, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7146207094192505, + "num_tokens": 484693323.0, + "step": 18730 + }, + { + "epoch": 2.05699538765649, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7642416954040527, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7461597919464111, + "num_tokens": 484713895.0, + "step": 18731 + }, + { + "epoch": 2.057105205359104, 
+ "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7059733867645264, + "learning_rate": 1e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.7566227316856384, + "num_tokens": 484734883.0, + "step": 18732 + }, + { + "epoch": 2.0572150230617177, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.599614381790161, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7241793870925903, + "num_tokens": 484757425.0, + "step": 18733 + }, + { + "epoch": 2.0573248407643314, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.414710521697998, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7182198166847229, + "num_tokens": 484782516.0, + "step": 18734 + }, + { + "epoch": 2.0574346584669447, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.76237416267395, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.728294849395752, + "num_tokens": 484805520.0, + "step": 18735 + }, + { + "epoch": 2.0575444761695585, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4297590255737305, + "learning_rate": 1e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.6919357776641846, + "num_tokens": 484833321.0, + "step": 18736 + }, + { + "epoch": 2.0576542938721722, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.1992878913879395, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6820827126502991, + "num_tokens": 484868316.0, + "step": 18737 + }, + { + "epoch": 2.057764111574786, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.489410638809204, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7221099138259888, + "num_tokens": 484892628.0, + "step": 18738 + }, + { + "epoch": 2.0578739292773993, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.410452127456665, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7119406461715698, + "num_tokens": 484919840.0, + "step": 18739 + }, + { + "epoch": 2.057983746980013, + "ewc_loss": 
2.0265579223632812e-05, + "grad_norm": 2.3856940269470215, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7250043153762817, + "num_tokens": 484949046.0, + "step": 18740 + }, + { + "epoch": 2.058093564682627, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5637409687042236, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7068790197372437, + "num_tokens": 484973166.0, + "step": 18741 + }, + { + "epoch": 2.0582033823852406, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.438311815261841, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7053665518760681, + "num_tokens": 484997745.0, + "step": 18742 + }, + { + "epoch": 2.0583132000878543, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.1998374462127686, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7185836434364319, + "num_tokens": 485030912.0, + "step": 18743 + }, + { + "epoch": 2.0584230177904677, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.377141237258911, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7203764915466309, + "num_tokens": 485059141.0, + "step": 18744 + }, + { + "epoch": 2.0585328354930814, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.536766529083252, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.72945237159729, + "num_tokens": 485083742.0, + "step": 18745 + }, + { + "epoch": 2.058642653195695, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.871201753616333, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7263468503952026, + "num_tokens": 485105674.0, + "step": 18746 + }, + { + "epoch": 2.058752470898309, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3368921279907227, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7164780497550964, + "num_tokens": 485132428.0, + "step": 18747 + }, + { + "epoch": 2.0588622886009227, + "ewc_loss": 2.0265579223632812e-05, + 
"grad_norm": 2.623389959335327, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7273455858230591, + "num_tokens": 485155653.0, + "step": 18748 + }, + { + "epoch": 2.058972106303536, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4578866958618164, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7161599397659302, + "num_tokens": 485183472.0, + "step": 18749 + }, + { + "epoch": 2.0590819240061498, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3859288692474365, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7105779647827148, + "num_tokens": 485208921.0, + "step": 18750 + }, + { + "epoch": 2.0591917417087635, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.1577463150024414, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7072261571884155, + "num_tokens": 485243201.0, + "step": 18751 + }, + { + "epoch": 2.0593015594113773, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.260206699371338, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.719031810760498, + "num_tokens": 485274135.0, + "step": 18752 + }, + { + "epoch": 2.0594113771139906, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4084205627441406, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7106552720069885, + "num_tokens": 485303106.0, + "step": 18753 + }, + { + "epoch": 2.0595211948166043, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5306849479675293, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7201077938079834, + "num_tokens": 485326573.0, + "step": 18754 + }, + { + "epoch": 2.059631012519218, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.390364170074463, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7185309529304504, + "num_tokens": 485354403.0, + "step": 18755 + }, + { + "epoch": 2.059740830221832, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 
2.489665985107422, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7081375122070312, + "num_tokens": 485381116.0, + "step": 18756 + }, + { + "epoch": 2.0598506479244456, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.520010471343994, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7316159009933472, + "num_tokens": 485404852.0, + "step": 18757 + }, + { + "epoch": 2.059960465627059, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4696640968322754, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7314860820770264, + "num_tokens": 485429368.0, + "step": 18758 + }, + { + "epoch": 2.0600702833296727, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.1477229595184326, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7069071531295776, + "num_tokens": 485462570.0, + "step": 18759 + }, + { + "epoch": 2.0601801010322864, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.39150333404541, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7357202768325806, + "num_tokens": 485486324.0, + "step": 18760 + }, + { + "epoch": 2.0602899187349, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.467918634414673, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7200395464897156, + "num_tokens": 485514079.0, + "step": 18761 + }, + { + "epoch": 2.060399736437514, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4846277236938477, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7011374235153198, + "num_tokens": 485539712.0, + "step": 18762 + }, + { + "epoch": 2.0605095541401273, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4619827270507812, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7031959295272827, + "num_tokens": 485565998.0, + "step": 18763 + }, + { + "epoch": 2.060619371842741, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.632901430130005, + "learning_rate": 
1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7053667306900024, + "num_tokens": 485590371.0, + "step": 18764 + }, + { + "epoch": 2.060729189545355, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.537353277206421, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7217441201210022, + "num_tokens": 485613661.0, + "step": 18765 + }, + { + "epoch": 2.0608390072479685, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7229881286621094, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7213326692581177, + "num_tokens": 485640582.0, + "step": 18766 + }, + { + "epoch": 2.060948824950582, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.424741506576538, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.700964629650116, + "num_tokens": 485667177.0, + "step": 18767 + }, + { + "epoch": 2.0610586426531956, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.302042007446289, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7267310619354248, + "num_tokens": 485692337.0, + "step": 18768 + }, + { + "epoch": 2.0611684603558094, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.570002317428589, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7085430026054382, + "num_tokens": 485715895.0, + "step": 18769 + }, + { + "epoch": 2.061278278058423, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4652504920959473, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7102135419845581, + "num_tokens": 485740056.0, + "step": 18770 + }, + { + "epoch": 2.061388095761037, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4723715782165527, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7242907285690308, + "num_tokens": 485765531.0, + "step": 18771 + }, + { + "epoch": 2.06149791346365, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.321115016937256, + "learning_rate": 1e-06, + "loss": 0.9249, + 
"mean_token_accuracy": 0.7213643193244934, + "num_tokens": 485790828.0, + "step": 18772 + }, + { + "epoch": 2.061607731166264, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3462400436401367, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6990019083023071, + "num_tokens": 485821307.0, + "step": 18773 + }, + { + "epoch": 2.0617175488688777, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.537715435028076, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7319663166999817, + "num_tokens": 485842368.0, + "step": 18774 + }, + { + "epoch": 2.0618273665714915, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3471248149871826, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7168152332305908, + "num_tokens": 485869451.0, + "step": 18775 + }, + { + "epoch": 2.0619371842741048, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.525846481323242, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7103211879730225, + "num_tokens": 485894433.0, + "step": 18776 + }, + { + "epoch": 2.0620470019767185, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.8688292503356934, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7286839485168457, + "num_tokens": 485915047.0, + "step": 18777 + }, + { + "epoch": 2.0621568196793323, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.556550979614258, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7116218209266663, + "num_tokens": 485942877.0, + "step": 18778 + }, + { + "epoch": 2.062266637381946, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.578023672103882, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7173593044281006, + "num_tokens": 485966534.0, + "step": 18779 + }, + { + "epoch": 2.06237645508456, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.8537800312042236, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 
0.7304922342300415, + "num_tokens": 485985634.0, + "step": 18780 + }, + { + "epoch": 2.062486272787173, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.582502603530884, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7167063355445862, + "num_tokens": 486012183.0, + "step": 18781 + }, + { + "epoch": 2.062596090489787, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.374530076980591, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7059206962585449, + "num_tokens": 486041381.0, + "step": 18782 + }, + { + "epoch": 2.0627059081924006, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.429840564727783, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7252644300460815, + "num_tokens": 486067899.0, + "step": 18783 + }, + { + "epoch": 2.0628157258950144, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5989630222320557, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7197760939598083, + "num_tokens": 486090785.0, + "step": 18784 + }, + { + "epoch": 2.062925543597628, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.343466281890869, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7154871225357056, + "num_tokens": 486117247.0, + "step": 18785 + }, + { + "epoch": 2.0630353613002415, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.473452568054199, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7165619730949402, + "num_tokens": 486142138.0, + "step": 18786 + }, + { + "epoch": 2.063145179002855, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.917440176010132, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7290794253349304, + "num_tokens": 486162436.0, + "step": 18787 + }, + { + "epoch": 2.063254996705469, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.8857836723327637, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7384116649627686, + "num_tokens": 
486183170.0, + "step": 18788 + }, + { + "epoch": 2.0633648144080827, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.1724839210510254, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6991912126541138, + "num_tokens": 486219250.0, + "step": 18789 + }, + { + "epoch": 2.063474632110696, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.334855079650879, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7070789933204651, + "num_tokens": 486248913.0, + "step": 18790 + }, + { + "epoch": 2.06358444981331, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7112157344818115, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7261821031570435, + "num_tokens": 486270415.0, + "step": 18791 + }, + { + "epoch": 2.0636942675159236, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4614062309265137, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6945931911468506, + "num_tokens": 486298050.0, + "step": 18792 + }, + { + "epoch": 2.0638040852185373, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5731866359710693, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7079900503158569, + "num_tokens": 486321495.0, + "step": 18793 + }, + { + "epoch": 2.063913902921151, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4808876514434814, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7246444225311279, + "num_tokens": 486346931.0, + "step": 18794 + }, + { + "epoch": 2.0640237206237644, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.436245918273926, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.7023734450340271, + "num_tokens": 486374258.0, + "step": 18795 + }, + { + "epoch": 2.064133538326378, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3497138023376465, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7083960771560669, + "num_tokens": 486401074.0, + "step": 18796 + 
}, + { + "epoch": 2.064243356028992, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5407321453094482, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7258525490760803, + "num_tokens": 486425411.0, + "step": 18797 + }, + { + "epoch": 2.0643531737316057, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5155599117279053, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7157049775123596, + "num_tokens": 486450354.0, + "step": 18798 + }, + { + "epoch": 2.0644629914342194, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.32072377204895, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7293634414672852, + "num_tokens": 486479627.0, + "step": 18799 + }, + { + "epoch": 2.0645728091368327, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6130924224853516, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7199828624725342, + "num_tokens": 486502869.0, + "step": 18800 + }, + { + "epoch": 2.0646826268394465, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.2040646076202393, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.7038687467575073, + "num_tokens": 486534327.0, + "step": 18801 + }, + { + "epoch": 2.0647924445420602, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5231988430023193, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.7023245096206665, + "num_tokens": 486559490.0, + "step": 18802 + }, + { + "epoch": 2.064902262244674, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.7486302852630615, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7194530367851257, + "num_tokens": 486579601.0, + "step": 18803 + }, + { + "epoch": 2.0650120799472873, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.6219534873962402, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7418581247329712, + "num_tokens": 486600957.0, + "step": 18804 + }, + { + "epoch": 
2.065121897649901, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.4786734580993652, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7215590476989746, + "num_tokens": 486627356.0, + "step": 18805 + }, + { + "epoch": 2.065231715352515, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.414658546447754, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7212303280830383, + "num_tokens": 486653002.0, + "step": 18806 + }, + { + "epoch": 2.0653415330551286, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.451026678085327, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7295306921005249, + "num_tokens": 486677465.0, + "step": 18807 + }, + { + "epoch": 2.0654513507577423, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3794052600860596, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7091063261032104, + "num_tokens": 486704398.0, + "step": 18808 + }, + { + "epoch": 2.0655611684603556, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.239560842514038, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7128613591194153, + "num_tokens": 486735319.0, + "step": 18809 + }, + { + "epoch": 2.0656709861629694, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.6967828273773193, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7087267637252808, + "num_tokens": 486756914.0, + "step": 18810 + }, + { + "epoch": 2.065780803865583, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.44391131401062, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7283247709274292, + "num_tokens": 486781933.0, + "step": 18811 + }, + { + "epoch": 2.065890621568197, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.3420140743255615, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7135359048843384, + "num_tokens": 486808660.0, + "step": 18812 + }, + { + "epoch": 2.0660004392708107, + "ewc_loss": 
2.0384788513183594e-05, + "grad_norm": 2.2316157817840576, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7036586999893188, + "num_tokens": 486844039.0, + "step": 18813 + }, + { + "epoch": 2.066110256973424, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.1762378215789795, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7115804553031921, + "num_tokens": 486878633.0, + "step": 18814 + }, + { + "epoch": 2.0662200746760377, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.341554880142212, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7255913019180298, + "num_tokens": 486907779.0, + "step": 18815 + }, + { + "epoch": 2.0663298923786515, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.477330207824707, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.7015149593353271, + "num_tokens": 486938039.0, + "step": 18816 + }, + { + "epoch": 2.0664397100812653, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.320871353149414, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6960576176643372, + "num_tokens": 486967462.0, + "step": 18817 + }, + { + "epoch": 2.0665495277838786, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.369823694229126, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7323243618011475, + "num_tokens": 486994806.0, + "step": 18818 + }, + { + "epoch": 2.0666593454864923, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4240128993988037, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6974664926528931, + "num_tokens": 487021312.0, + "step": 18819 + }, + { + "epoch": 2.066769163189106, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.6474132537841797, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7214559316635132, + "num_tokens": 487048531.0, + "step": 18820 + }, + { + "epoch": 2.06687898089172, + "ewc_loss": 2.0384788513183594e-05, + 
"grad_norm": 2.467979907989502, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7177738547325134, + "num_tokens": 487074614.0, + "step": 18821 + }, + { + "epoch": 2.0669887985943336, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.540940999984741, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7188619375228882, + "num_tokens": 487097613.0, + "step": 18822 + }, + { + "epoch": 2.067098616296947, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.610095262527466, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7089680433273315, + "num_tokens": 487124192.0, + "step": 18823 + }, + { + "epoch": 2.0672084339995607, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.263899326324463, + "learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6853399276733398, + "num_tokens": 487157659.0, + "step": 18824 + }, + { + "epoch": 2.0673182517021744, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.235830545425415, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7424525618553162, + "num_tokens": 487186665.0, + "step": 18825 + }, + { + "epoch": 2.067428069404788, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4923641681671143, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7395226955413818, + "num_tokens": 487213021.0, + "step": 18826 + }, + { + "epoch": 2.0675378871074015, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5804967880249023, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.70991051197052, + "num_tokens": 487236826.0, + "step": 18827 + }, + { + "epoch": 2.0676477048100153, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.703815221786499, + "learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7501174211502075, + "num_tokens": 487257249.0, + "step": 18828 + }, + { + "epoch": 2.067757522512629, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5658936500549316, + 
"learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7248501181602478, + "num_tokens": 487279719.0, + "step": 18829 + }, + { + "epoch": 2.0678673402152428, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3185102939605713, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7146083116531372, + "num_tokens": 487305998.0, + "step": 18830 + }, + { + "epoch": 2.0679771579178565, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.3072707653045654, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7313477396965027, + "num_tokens": 487335168.0, + "step": 18831 + }, + { + "epoch": 2.06808697562047, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.5247013568878174, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7198250889778137, + "num_tokens": 487359653.0, + "step": 18832 + }, + { + "epoch": 2.0681967933230836, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.715697765350342, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7325245141983032, + "num_tokens": 487379403.0, + "step": 18833 + }, + { + "epoch": 2.0683066110256974, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 2.513636589050293, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7172925472259521, + "num_tokens": 487403805.0, + "step": 18834 + }, + { + "epoch": 2.068416428728311, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.3892648220062256, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7194141149520874, + "num_tokens": 487429302.0, + "step": 18835 + }, + { + "epoch": 2.068526246430925, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.368586301803589, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7399588823318481, + "num_tokens": 487454107.0, + "step": 18836 + }, + { + "epoch": 2.068636064133538, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.454608201980591, + "learning_rate": 1e-06, + "loss": 
0.906, + "mean_token_accuracy": 0.723268449306488, + "num_tokens": 487480299.0, + "step": 18837 + }, + { + "epoch": 2.068745881836152, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4736831188201904, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7077188491821289, + "num_tokens": 487507207.0, + "step": 18838 + }, + { + "epoch": 2.0688556995387657, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4635043144226074, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7312096953392029, + "num_tokens": 487532758.0, + "step": 18839 + }, + { + "epoch": 2.0689655172413794, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.638003349304199, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7315854430198669, + "num_tokens": 487555940.0, + "step": 18840 + }, + { + "epoch": 2.0690753349439928, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.531855821609497, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.708008348941803, + "num_tokens": 487581254.0, + "step": 18841 + }, + { + "epoch": 2.0691851526466065, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.8356423377990723, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7373124361038208, + "num_tokens": 487600889.0, + "step": 18842 + }, + { + "epoch": 2.0692949703492203, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.892374277114868, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7387350797653198, + "num_tokens": 487620106.0, + "step": 18843 + }, + { + "epoch": 2.069404788051834, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.1840827465057373, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7380983829498291, + "num_tokens": 487652019.0, + "step": 18844 + }, + { + "epoch": 2.069514605754448, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.2030956745147705, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 
0.7196469902992249, + "num_tokens": 487682072.0, + "step": 18845 + }, + { + "epoch": 2.069624423457061, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.192790985107422, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7121302485466003, + "num_tokens": 487712303.0, + "step": 18846 + }, + { + "epoch": 2.069734241159675, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.633958339691162, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7270622253417969, + "num_tokens": 487733598.0, + "step": 18847 + }, + { + "epoch": 2.0698440588622886, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.431344985961914, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7252333760261536, + "num_tokens": 487758116.0, + "step": 18848 + }, + { + "epoch": 2.0699538765649024, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.398580551147461, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7015398144721985, + "num_tokens": 487784928.0, + "step": 18849 + }, + { + "epoch": 2.070063694267516, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5152456760406494, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7192504405975342, + "num_tokens": 487806999.0, + "step": 18850 + }, + { + "epoch": 2.0701735119701294, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.50454044342041, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7384036779403687, + "num_tokens": 487829487.0, + "step": 18851 + }, + { + "epoch": 2.070283329672743, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.34597110748291, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7102349400520325, + "num_tokens": 487857701.0, + "step": 18852 + }, + { + "epoch": 2.070393147375357, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.2869465351104736, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7106245756149292, + "num_tokens": 
487886019.0, + "step": 18853 + }, + { + "epoch": 2.0705029650779707, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.708590269088745, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7383921146392822, + "num_tokens": 487906054.0, + "step": 18854 + }, + { + "epoch": 2.070612782780584, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.3262174129486084, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7284669876098633, + "num_tokens": 487932732.0, + "step": 18855 + }, + { + "epoch": 2.070722600483198, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5856847763061523, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7056686282157898, + "num_tokens": 487954950.0, + "step": 18856 + }, + { + "epoch": 2.0708324181858115, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.0615034103393555, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7145328521728516, + "num_tokens": 487989789.0, + "step": 18857 + }, + { + "epoch": 2.0709422358884253, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.501282215118408, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7162380218505859, + "num_tokens": 488013438.0, + "step": 18858 + }, + { + "epoch": 2.071052053591039, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 3.068162679672241, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7196668386459351, + "num_tokens": 488033967.0, + "step": 18859 + }, + { + "epoch": 2.0711618712936524, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.7735893726348877, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7182250022888184, + "num_tokens": 488055618.0, + "step": 18860 + }, + { + "epoch": 2.071271688996266, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.6210479736328125, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.721908688545227, + "num_tokens": 488077824.0, + "step": 18861 + 
}, + { + "epoch": 2.07138150669888, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.495936632156372, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7066988945007324, + "num_tokens": 488103374.0, + "step": 18862 + }, + { + "epoch": 2.0714913244014936, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.3009278774261475, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7314423322677612, + "num_tokens": 488129776.0, + "step": 18863 + }, + { + "epoch": 2.0716011421041074, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.501720428466797, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7089579105377197, + "num_tokens": 488154567.0, + "step": 18864 + }, + { + "epoch": 2.0717109598067207, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.7272794246673584, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.72266685962677, + "num_tokens": 488175092.0, + "step": 18865 + }, + { + "epoch": 2.0718207775093345, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5180270671844482, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6924923062324524, + "num_tokens": 488202329.0, + "step": 18866 + }, + { + "epoch": 2.0719305952119482, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4392850399017334, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7103265523910522, + "num_tokens": 488229448.0, + "step": 18867 + }, + { + "epoch": 2.072040412914562, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.943474531173706, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.722354531288147, + "num_tokens": 488248341.0, + "step": 18868 + }, + { + "epoch": 2.0721502306171753, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.3318002223968506, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6974164247512817, + "num_tokens": 488276403.0, + "step": 18869 + }, + { + "epoch": 
2.072260048319789, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.471846580505371, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7048503160476685, + "num_tokens": 488300829.0, + "step": 18870 + }, + { + "epoch": 2.072369866022403, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.308335542678833, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.7071315050125122, + "num_tokens": 488330090.0, + "step": 18871 + }, + { + "epoch": 2.0724796837250166, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.3424901962280273, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7199360132217407, + "num_tokens": 488357430.0, + "step": 18872 + }, + { + "epoch": 2.0725895014276303, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.462960958480835, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7171357274055481, + "num_tokens": 488382389.0, + "step": 18873 + }, + { + "epoch": 2.0726993191302436, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5889782905578613, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7305527925491333, + "num_tokens": 488405819.0, + "step": 18874 + }, + { + "epoch": 2.0728091368328574, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.588522434234619, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7132083177566528, + "num_tokens": 488428902.0, + "step": 18875 + }, + { + "epoch": 2.072918954535471, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.562931537628174, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7358894348144531, + "num_tokens": 488451267.0, + "step": 18876 + }, + { + "epoch": 2.073028772238085, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.310373067855835, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7216464281082153, + "num_tokens": 488477786.0, + "step": 18877 + }, + { + "epoch": 2.073138589940698, + "ewc_loss": 
2.0384788513183594e-05, + "grad_norm": 2.492774724960327, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7311543822288513, + "num_tokens": 488501779.0, + "step": 18878 + }, + { + "epoch": 2.073248407643312, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5576751232147217, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7144990563392639, + "num_tokens": 488525927.0, + "step": 18879 + }, + { + "epoch": 2.0733582253459257, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.6646642684936523, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7181954979896545, + "num_tokens": 488549374.0, + "step": 18880 + }, + { + "epoch": 2.0734680430485395, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.1964197158813477, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7036492824554443, + "num_tokens": 488581234.0, + "step": 18881 + }, + { + "epoch": 2.0735778607511532, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5908045768737793, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7209210395812988, + "num_tokens": 488604300.0, + "step": 18882 + }, + { + "epoch": 2.0736876784537666, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4761714935302734, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7114077210426331, + "num_tokens": 488628493.0, + "step": 18883 + }, + { + "epoch": 2.0737974961563803, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.6819679737091064, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7389671802520752, + "num_tokens": 488649579.0, + "step": 18884 + }, + { + "epoch": 2.073907313858994, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.3380393981933594, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7044657468795776, + "num_tokens": 488679453.0, + "step": 18885 + }, + { + "epoch": 2.074017131561608, + "ewc_loss": 2.0384788513183594e-05, + 
"grad_norm": 2.3233530521392822, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7235856056213379, + "num_tokens": 488706913.0, + "step": 18886 + }, + { + "epoch": 2.0741269492642216, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.3549630641937256, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7239174842834473, + "num_tokens": 488733613.0, + "step": 18887 + }, + { + "epoch": 2.074236766966835, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.397094964981079, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7062028646469116, + "num_tokens": 488760801.0, + "step": 18888 + }, + { + "epoch": 2.0743465846694487, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.435920238494873, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7082062363624573, + "num_tokens": 488788687.0, + "step": 18889 + }, + { + "epoch": 2.0744564023720624, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.510382652282715, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.718994140625, + "num_tokens": 488814072.0, + "step": 18890 + }, + { + "epoch": 2.074566220074676, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3550848960876465, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7442182302474976, + "num_tokens": 488841485.0, + "step": 18891 + }, + { + "epoch": 2.07467603777729, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4397144317626953, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7250524759292603, + "num_tokens": 488867353.0, + "step": 18892 + }, + { + "epoch": 2.0747858554799032, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4225170612335205, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6886619925498962, + "num_tokens": 488896078.0, + "step": 18893 + }, + { + "epoch": 2.074895673182517, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.7461531162261963, + 
"learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7158440947532654, + "num_tokens": 488917064.0, + "step": 18894 + }, + { + "epoch": 2.0750054908851308, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.29062819480896, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7101963758468628, + "num_tokens": 488945488.0, + "step": 18895 + }, + { + "epoch": 2.0751153085877445, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4709670543670654, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7220050096511841, + "num_tokens": 488968188.0, + "step": 18896 + }, + { + "epoch": 2.075225126290358, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.8599822521209717, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7271429300308228, + "num_tokens": 488989368.0, + "step": 18897 + }, + { + "epoch": 2.0753349439929716, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.401061773300171, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7093336582183838, + "num_tokens": 489019061.0, + "step": 18898 + }, + { + "epoch": 2.0754447616955853, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4065449237823486, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7022813558578491, + "num_tokens": 489049187.0, + "step": 18899 + }, + { + "epoch": 2.075554579398199, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.589409828186035, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7254878282546997, + "num_tokens": 489072262.0, + "step": 18900 + }, + { + "epoch": 2.075664397100813, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.4133362770080566, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7207088470458984, + "num_tokens": 489098065.0, + "step": 18901 + }, + { + "epoch": 2.075774214803426, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.5046966075897217, + "learning_rate": 1e-06, + 
"loss": 0.942, + "mean_token_accuracy": 0.7181112766265869, + "num_tokens": 489122738.0, + "step": 18902 + }, + { + "epoch": 2.07588403250604, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.516047954559326, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.7030167579650879, + "num_tokens": 489149658.0, + "step": 18903 + }, + { + "epoch": 2.0759938502086537, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 2.6832263469696045, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7297846078872681, + "num_tokens": 489170115.0, + "step": 18904 + }, + { + "epoch": 2.0761036679112674, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3744277954101562, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6942214965820312, + "num_tokens": 489198899.0, + "step": 18905 + }, + { + "epoch": 2.0762134856138807, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.648646593093872, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7143813371658325, + "num_tokens": 489220994.0, + "step": 18906 + }, + { + "epoch": 2.0763233033164945, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3000872135162354, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7109665870666504, + "num_tokens": 489247684.0, + "step": 18907 + }, + { + "epoch": 2.0764331210191083, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.343869686126709, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7255550622940063, + "num_tokens": 489274000.0, + "step": 18908 + }, + { + "epoch": 2.076542938721722, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3106679916381836, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7217773795127869, + "num_tokens": 489301138.0, + "step": 18909 + }, + { + "epoch": 2.0766527564243358, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6647040843963623, + "learning_rate": 1e-06, + "loss": 0.956, + 
"mean_token_accuracy": 0.7197273969650269, + "num_tokens": 489323949.0, + "step": 18910 + }, + { + "epoch": 2.076762574126949, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.489706516265869, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7145916223526001, + "num_tokens": 489348177.0, + "step": 18911 + }, + { + "epoch": 2.076872391829563, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.581610679626465, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7102844715118408, + "num_tokens": 489370653.0, + "step": 18912 + }, + { + "epoch": 2.0769822095321766, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6019303798675537, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7069143056869507, + "num_tokens": 489396076.0, + "step": 18913 + }, + { + "epoch": 2.0770920272347904, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3134639263153076, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7493880391120911, + "num_tokens": 489422322.0, + "step": 18914 + }, + { + "epoch": 2.077201844937404, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3439552783966064, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7032972574234009, + "num_tokens": 489452241.0, + "step": 18915 + }, + { + "epoch": 2.0773116626400174, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.730466842651367, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7250010967254639, + "num_tokens": 489471717.0, + "step": 18916 + }, + { + "epoch": 2.077421480342631, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5978071689605713, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7395715713500977, + "num_tokens": 489493599.0, + "step": 18917 + }, + { + "epoch": 2.077531298045245, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5356059074401855, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 
0.6975772380828857, + "num_tokens": 489517991.0, + "step": 18918 + }, + { + "epoch": 2.0776411157478587, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2043495178222656, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6930755376815796, + "num_tokens": 489550331.0, + "step": 18919 + }, + { + "epoch": 2.077750933450472, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.361938714981079, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7066307663917542, + "num_tokens": 489576755.0, + "step": 18920 + }, + { + "epoch": 2.0778607511530858, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4867966175079346, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7246783375740051, + "num_tokens": 489603232.0, + "step": 18921 + }, + { + "epoch": 2.0779705688556995, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3070712089538574, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6940661668777466, + "num_tokens": 489633861.0, + "step": 18922 + }, + { + "epoch": 2.0780803865583133, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.690053701400757, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.71449875831604, + "num_tokens": 489656715.0, + "step": 18923 + }, + { + "epoch": 2.078190204260927, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7064905166625977, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7219600677490234, + "num_tokens": 489678745.0, + "step": 18924 + }, + { + "epoch": 2.0783000219635404, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3697240352630615, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7111859321594238, + "num_tokens": 489705042.0, + "step": 18925 + }, + { + "epoch": 2.078409839666154, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.355860948562622, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7166666388511658, + 
"num_tokens": 489732389.0, + "step": 18926 + }, + { + "epoch": 2.078519657368768, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5179595947265625, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6882065534591675, + "num_tokens": 489756253.0, + "step": 18927 + }, + { + "epoch": 2.0786294750713816, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.599693536758423, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7375574707984924, + "num_tokens": 489779374.0, + "step": 18928 + }, + { + "epoch": 2.078739292773995, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.651855707168579, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7085316181182861, + "num_tokens": 489802145.0, + "step": 18929 + }, + { + "epoch": 2.0788491104766087, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2424964904785156, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7025280594825745, + "num_tokens": 489832481.0, + "step": 18930 + }, + { + "epoch": 2.0789589281792225, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5809829235076904, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7067615985870361, + "num_tokens": 489855325.0, + "step": 18931 + }, + { + "epoch": 2.079068745881836, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4572267532348633, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7138334512710571, + "num_tokens": 489880338.0, + "step": 18932 + }, + { + "epoch": 2.07917856358445, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5324013233184814, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7247593998908997, + "num_tokens": 489904091.0, + "step": 18933 + }, + { + "epoch": 2.0792883812870633, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.592644691467285, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7116899490356445, + "num_tokens": 489928676.0, + 
"step": 18934 + }, + { + "epoch": 2.079398198989677, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2711760997772217, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7132796049118042, + "num_tokens": 489956846.0, + "step": 18935 + }, + { + "epoch": 2.079508016692291, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.592261552810669, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7122156620025635, + "num_tokens": 489981568.0, + "step": 18936 + }, + { + "epoch": 2.0796178343949046, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.45638370513916, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7291029691696167, + "num_tokens": 490007835.0, + "step": 18937 + }, + { + "epoch": 2.0797276520975183, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7140042781829834, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7698756456375122, + "num_tokens": 490027937.0, + "step": 18938 + }, + { + "epoch": 2.0798374698001316, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.1069562435150146, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.7045009136199951, + "num_tokens": 490063001.0, + "step": 18939 + }, + { + "epoch": 2.0799472875027454, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.458502769470215, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7298582792282104, + "num_tokens": 490088863.0, + "step": 18940 + }, + { + "epoch": 2.080057105205359, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.58642840385437, + "learning_rate": 1e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7467988729476929, + "num_tokens": 490110820.0, + "step": 18941 + }, + { + "epoch": 2.080166922907973, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2065958976745605, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.707330048084259, + "num_tokens": 490140847.0, + "step": 18942 + }, + { + "epoch": 
2.0802767406105866, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.412688970565796, + "learning_rate": 1e-06, + "loss": 1.0944, + "mean_token_accuracy": 0.6905521154403687, + "num_tokens": 490166679.0, + "step": 18943 + }, + { + "epoch": 2.0803865583132, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.752837657928467, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7262916564941406, + "num_tokens": 490187894.0, + "step": 18944 + }, + { + "epoch": 2.0804963760158137, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.064222574234009, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7126396894454956, + "num_tokens": 490221796.0, + "step": 18945 + }, + { + "epoch": 2.0806061937184275, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.529339075088501, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7145444750785828, + "num_tokens": 490243502.0, + "step": 18946 + }, + { + "epoch": 2.0807160114210412, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.276170492172241, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7288202047348022, + "num_tokens": 490271915.0, + "step": 18947 + }, + { + "epoch": 2.0808258291236545, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.341944932937622, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7184323072433472, + "num_tokens": 490300156.0, + "step": 18948 + }, + { + "epoch": 2.0809356468262683, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.33794903755188, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6854719519615173, + "num_tokens": 490330308.0, + "step": 18949 + }, + { + "epoch": 2.081045464528882, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.274068832397461, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6988816261291504, + "num_tokens": 490358907.0, + "step": 18950 + }, + { + "epoch": 2.081155282231496, + "ewc_loss": 
2.0623207092285156e-05, + "grad_norm": 2.514342784881592, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7069704532623291, + "num_tokens": 490383541.0, + "step": 18951 + }, + { + "epoch": 2.0812650999341096, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3999598026275635, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7224493026733398, + "num_tokens": 490409125.0, + "step": 18952 + }, + { + "epoch": 2.081374917636723, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5480167865753174, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7253092527389526, + "num_tokens": 490432306.0, + "step": 18953 + }, + { + "epoch": 2.0814847353393366, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.676034927368164, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7082706093788147, + "num_tokens": 490454009.0, + "step": 18954 + }, + { + "epoch": 2.0815945530419504, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.374077081680298, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7047544717788696, + "num_tokens": 490482565.0, + "step": 18955 + }, + { + "epoch": 2.081704370744564, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.219675064086914, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6922746896743774, + "num_tokens": 490519834.0, + "step": 18956 + }, + { + "epoch": 2.0818141884471775, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.423037052154541, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7410352230072021, + "num_tokens": 490546074.0, + "step": 18957 + }, + { + "epoch": 2.0819240061497912, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4940762519836426, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7435341477394104, + "num_tokens": 490568911.0, + "step": 18958 + }, + { + "epoch": 2.082033823852405, + "ewc_loss": 2.0623207092285156e-05, + 
"grad_norm": 2.437422752380371, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7005255222320557, + "num_tokens": 490594591.0, + "step": 18959 + }, + { + "epoch": 2.0821436415550187, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5104525089263916, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7190317511558533, + "num_tokens": 490619546.0, + "step": 18960 + }, + { + "epoch": 2.0822534592576325, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4656105041503906, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7185075283050537, + "num_tokens": 490648240.0, + "step": 18961 + }, + { + "epoch": 2.082363276960246, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4796688556671143, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7137284278869629, + "num_tokens": 490672451.0, + "step": 18962 + }, + { + "epoch": 2.0824730946628596, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.48677396774292, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.728607177734375, + "num_tokens": 490698015.0, + "step": 18963 + }, + { + "epoch": 2.0825829123654733, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4863810539245605, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7334052324295044, + "num_tokens": 490721407.0, + "step": 18964 + }, + { + "epoch": 2.082692730068087, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.60305118560791, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7398000359535217, + "num_tokens": 490743600.0, + "step": 18965 + }, + { + "epoch": 2.082802547770701, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5147135257720947, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7165700197219849, + "num_tokens": 490766922.0, + "step": 18966 + }, + { + "epoch": 2.082912365473314, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7491931915283203, + 
"learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7317367792129517, + "num_tokens": 490787071.0, + "step": 18967 + }, + { + "epoch": 2.083022183175928, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.0754034519195557, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.718235194683075, + "num_tokens": 490818928.0, + "step": 18968 + }, + { + "epoch": 2.0831320008785417, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3516011238098145, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7066723108291626, + "num_tokens": 490845146.0, + "step": 18969 + }, + { + "epoch": 2.0832418185811554, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.609748125076294, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7124342918395996, + "num_tokens": 490870347.0, + "step": 18970 + }, + { + "epoch": 2.0833516362837687, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4209141731262207, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7106261849403381, + "num_tokens": 490897708.0, + "step": 18971 + }, + { + "epoch": 2.0834614539863825, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3431859016418457, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7052539587020874, + "num_tokens": 490926444.0, + "step": 18972 + }, + { + "epoch": 2.0835712716889963, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.582819700241089, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7308098077774048, + "num_tokens": 490951897.0, + "step": 18973 + }, + { + "epoch": 2.08368108939161, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.1281096935272217, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6945662498474121, + "num_tokens": 490987745.0, + "step": 18974 + }, + { + "epoch": 2.0837909070942238, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.182628870010376, + "learning_rate": 1e-06, + 
"loss": 1.0022, + "mean_token_accuracy": 0.7000936269760132, + "num_tokens": 491018623.0, + "step": 18975 + }, + { + "epoch": 2.083900724796837, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.505322217941284, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.719818115234375, + "num_tokens": 491042741.0, + "step": 18976 + }, + { + "epoch": 2.084010542499451, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3943912982940674, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7163447141647339, + "num_tokens": 491068750.0, + "step": 18977 + }, + { + "epoch": 2.0841203602020646, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5185062885284424, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6990078091621399, + "num_tokens": 491092239.0, + "step": 18978 + }, + { + "epoch": 2.0842301779046783, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.647146701812744, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7321539521217346, + "num_tokens": 491114475.0, + "step": 18979 + }, + { + "epoch": 2.084339995607292, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7902817726135254, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7297539710998535, + "num_tokens": 491134611.0, + "step": 18980 + }, + { + "epoch": 2.0844498133099054, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.707719326019287, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7200241684913635, + "num_tokens": 491155132.0, + "step": 18981 + }, + { + "epoch": 2.084559631012519, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6089110374450684, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.728655219078064, + "num_tokens": 491178243.0, + "step": 18982 + }, + { + "epoch": 2.084669448715133, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3334009647369385, + "learning_rate": 1e-06, + "loss": 1.0911, + 
"mean_token_accuracy": 0.6940211653709412, + "num_tokens": 491209504.0, + "step": 18983 + }, + { + "epoch": 2.0847792664177467, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6656007766723633, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7144869565963745, + "num_tokens": 491234330.0, + "step": 18984 + }, + { + "epoch": 2.08488908412036, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3552775382995605, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7168769240379333, + "num_tokens": 491262831.0, + "step": 18985 + }, + { + "epoch": 2.0849989018229738, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.415299892425537, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6916000843048096, + "num_tokens": 491290913.0, + "step": 18986 + }, + { + "epoch": 2.0851087195255875, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.640638828277588, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6996017694473267, + "num_tokens": 491314614.0, + "step": 18987 + }, + { + "epoch": 2.0852185372282013, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3520753383636475, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.7003046274185181, + "num_tokens": 491344133.0, + "step": 18988 + }, + { + "epoch": 2.085328354930815, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.459296464920044, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.714665412902832, + "num_tokens": 491370174.0, + "step": 18989 + }, + { + "epoch": 2.0854381726334283, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.46907114982605, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7210660576820374, + "num_tokens": 491396116.0, + "step": 18990 + }, + { + "epoch": 2.085547990336042, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.227029800415039, + "learning_rate": 1e-06, + "loss": 1.0816, + "mean_token_accuracy": 
0.6891685128211975, + "num_tokens": 491430346.0, + "step": 18991 + }, + { + "epoch": 2.085657808038656, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3236067295074463, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7160466909408569, + "num_tokens": 491458979.0, + "step": 18992 + }, + { + "epoch": 2.0857676257412696, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.480379819869995, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.730034351348877, + "num_tokens": 491484176.0, + "step": 18993 + }, + { + "epoch": 2.0858774434438834, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4506125450134277, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7146027088165283, + "num_tokens": 491509964.0, + "step": 18994 + }, + { + "epoch": 2.0859872611464967, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.701385736465454, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7159103155136108, + "num_tokens": 491531663.0, + "step": 18995 + }, + { + "epoch": 2.0860970788491104, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7439827919006348, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7049489617347717, + "num_tokens": 491554738.0, + "step": 18996 + }, + { + "epoch": 2.086206896551724, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6064839363098145, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7251514196395874, + "num_tokens": 491578738.0, + "step": 18997 + }, + { + "epoch": 2.086316714254338, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.365906238555908, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6987890005111694, + "num_tokens": 491606169.0, + "step": 18998 + }, + { + "epoch": 2.0864265319569513, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.404918670654297, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7272562980651855, + 
"num_tokens": 491634519.0, + "step": 18999 + }, + { + "epoch": 2.086536349659565, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4886045455932617, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6915517449378967, + "num_tokens": 491664802.0, + "step": 19000 + }, + { + "epoch": 2.086646167362179, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.0474095344543457, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.728263258934021, + "num_tokens": 491699769.0, + "step": 19001 + }, + { + "epoch": 2.0867559850647925, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4649550914764404, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7149426937103271, + "num_tokens": 491726122.0, + "step": 19002 + }, + { + "epoch": 2.0868658027674063, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.482454538345337, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7243029475212097, + "num_tokens": 491751062.0, + "step": 19003 + }, + { + "epoch": 2.0869756204700196, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.438333749771118, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7213238477706909, + "num_tokens": 491777880.0, + "step": 19004 + }, + { + "epoch": 2.0870854381726334, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.419159173965454, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7154161930084229, + "num_tokens": 491803602.0, + "step": 19005 + }, + { + "epoch": 2.087195255875247, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.285593032836914, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7158187627792358, + "num_tokens": 491833549.0, + "step": 19006 + }, + { + "epoch": 2.087305073577861, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.726215124130249, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.693649411201477, + "num_tokens": 491855771.0, + 
"step": 19007 + }, + { + "epoch": 2.087414891280474, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3134827613830566, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7156496644020081, + "num_tokens": 491883597.0, + "step": 19008 + }, + { + "epoch": 2.087524708983088, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.224705934524536, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7041206359863281, + "num_tokens": 491912746.0, + "step": 19009 + }, + { + "epoch": 2.0876345266857017, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.51804256439209, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7209537029266357, + "num_tokens": 491937029.0, + "step": 19010 + }, + { + "epoch": 2.0877443443883155, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6938583850860596, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7281550168991089, + "num_tokens": 491959920.0, + "step": 19011 + }, + { + "epoch": 2.087854162090929, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3918769359588623, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7293073534965515, + "num_tokens": 491989044.0, + "step": 19012 + }, + { + "epoch": 2.0879639797935425, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4955053329467773, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7192087173461914, + "num_tokens": 492013024.0, + "step": 19013 + }, + { + "epoch": 2.0880737974961563, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4175565242767334, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7119526863098145, + "num_tokens": 492044906.0, + "step": 19014 + }, + { + "epoch": 2.08818361519877, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.483515501022339, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7170835137367249, + "num_tokens": 492070927.0, + "step": 19015 + }, + { + "epoch": 
2.088293432901384, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5908803939819336, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7099512219429016, + "num_tokens": 492095050.0, + "step": 19016 + }, + { + "epoch": 2.0884032506039976, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.745253801345825, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.719316840171814, + "num_tokens": 492115775.0, + "step": 19017 + }, + { + "epoch": 2.088513068306611, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.630188226699829, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7122929692268372, + "num_tokens": 492139040.0, + "step": 19018 + }, + { + "epoch": 2.0886228860092246, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.431185245513916, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7073668837547302, + "num_tokens": 492167280.0, + "step": 19019 + }, + { + "epoch": 2.0887327037118384, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5365898609161377, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7083300352096558, + "num_tokens": 492193315.0, + "step": 19020 + }, + { + "epoch": 2.088842521414452, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5715911388397217, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7109474539756775, + "num_tokens": 492217710.0, + "step": 19021 + }, + { + "epoch": 2.0889523391170655, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7872369289398193, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.7080948352813721, + "num_tokens": 492238561.0, + "step": 19022 + }, + { + "epoch": 2.089062156819679, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.596029758453369, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7146833539009094, + "num_tokens": 492261352.0, + "step": 19023 + }, + { + "epoch": 2.089171974522293, + "ewc_loss": 
2.0623207092285156e-05, + "grad_norm": 2.4217212200164795, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7129133939743042, + "num_tokens": 492286699.0, + "step": 19024 + }, + { + "epoch": 2.0892817922249067, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5527937412261963, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7284702062606812, + "num_tokens": 492309374.0, + "step": 19025 + }, + { + "epoch": 2.0893916099275205, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.0990800857543945, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7066818475723267, + "num_tokens": 492342546.0, + "step": 19026 + }, + { + "epoch": 2.089501427630134, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5573387145996094, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7360996603965759, + "num_tokens": 492369335.0, + "step": 19027 + }, + { + "epoch": 2.0896112453327476, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.589705467224121, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7114116549491882, + "num_tokens": 492393485.0, + "step": 19028 + }, + { + "epoch": 2.0897210630353613, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6973066329956055, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7153289318084717, + "num_tokens": 492417059.0, + "step": 19029 + }, + { + "epoch": 2.089830880737975, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.849787473678589, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7157899141311646, + "num_tokens": 492438264.0, + "step": 19030 + }, + { + "epoch": 2.089940698440589, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5150763988494873, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.7003933787345886, + "num_tokens": 492463019.0, + "step": 19031 + }, + { + "epoch": 2.090050516143202, + "ewc_loss": 2.0623207092285156e-05, + 
"grad_norm": 2.4202821254730225, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7139225006103516, + "num_tokens": 492489882.0, + "step": 19032 + }, + { + "epoch": 2.090160333845816, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5521631240844727, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7287859916687012, + "num_tokens": 492514387.0, + "step": 19033 + }, + { + "epoch": 2.0902701515484297, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.686382532119751, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7344979643821716, + "num_tokens": 492535709.0, + "step": 19034 + }, + { + "epoch": 2.0903799692510434, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5880277156829834, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7161778211593628, + "num_tokens": 492558357.0, + "step": 19035 + }, + { + "epoch": 2.0904897869536567, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.424574613571167, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7066073417663574, + "num_tokens": 492585376.0, + "step": 19036 + }, + { + "epoch": 2.0905996046562705, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3030762672424316, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6971095204353333, + "num_tokens": 492614595.0, + "step": 19037 + }, + { + "epoch": 2.0907094223588842, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.921942710876465, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7298852205276489, + "num_tokens": 492632727.0, + "step": 19038 + }, + { + "epoch": 2.090819240061498, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2948014736175537, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7021594643592834, + "num_tokens": 492662994.0, + "step": 19039 + }, + { + "epoch": 2.0909290577641118, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 
2.3617284297943115, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7156285047531128, + "num_tokens": 492690017.0, + "step": 19040 + }, + { + "epoch": 2.091038875466725, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3434319496154785, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7174145579338074, + "num_tokens": 492718107.0, + "step": 19041 + }, + { + "epoch": 2.091148693169339, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.697934865951538, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7187288403511047, + "num_tokens": 492741305.0, + "step": 19042 + }, + { + "epoch": 2.0912585108719526, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5056545734405518, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7143727540969849, + "num_tokens": 492766787.0, + "step": 19043 + }, + { + "epoch": 2.0913683285745663, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5049235820770264, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7180043458938599, + "num_tokens": 492793789.0, + "step": 19044 + }, + { + "epoch": 2.09147814627718, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3447134494781494, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7283426523208618, + "num_tokens": 492820917.0, + "step": 19045 + }, + { + "epoch": 2.0915879639797934, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4967682361602783, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.697464108467102, + "num_tokens": 492849776.0, + "step": 19046 + }, + { + "epoch": 2.091697781682407, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3068182468414307, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7294340133666992, + "num_tokens": 492877582.0, + "step": 19047 + }, + { + "epoch": 2.091807599385021, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.519859552383423, + 
"learning_rate": 1e-06, + "loss": 1.087, + "mean_token_accuracy": 0.6905219554901123, + "num_tokens": 492906048.0, + "step": 19048 + }, + { + "epoch": 2.0919174170876347, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5560595989227295, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7119616270065308, + "num_tokens": 492928817.0, + "step": 19049 + }, + { + "epoch": 2.092027234790248, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6093387603759766, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7228316068649292, + "num_tokens": 492951138.0, + "step": 19050 + }, + { + "epoch": 2.0921370524928617, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.1488423347473145, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7248371243476868, + "num_tokens": 492985746.0, + "step": 19051 + }, + { + "epoch": 2.0922468701954755, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5425689220428467, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7223666906356812, + "num_tokens": 493008945.0, + "step": 19052 + }, + { + "epoch": 2.0923566878980893, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5287818908691406, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7232123613357544, + "num_tokens": 493032172.0, + "step": 19053 + }, + { + "epoch": 2.092466505600703, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.1069390773773193, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7038663625717163, + "num_tokens": 493064895.0, + "step": 19054 + }, + { + "epoch": 2.0925763233033163, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3080501556396484, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6862640380859375, + "num_tokens": 493094111.0, + "step": 19055 + }, + { + "epoch": 2.09268614100593, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.308387279510498, + "learning_rate": 1e-06, + 
"loss": 0.9476, + "mean_token_accuracy": 0.7217657566070557, + "num_tokens": 493120831.0, + "step": 19056 + }, + { + "epoch": 2.092795958708544, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.351214647293091, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7185409069061279, + "num_tokens": 493148749.0, + "step": 19057 + }, + { + "epoch": 2.0929057764111576, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.1359896659851074, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7079695463180542, + "num_tokens": 493187015.0, + "step": 19058 + }, + { + "epoch": 2.093015594113771, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6558165550231934, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7082692384719849, + "num_tokens": 493211688.0, + "step": 19059 + }, + { + "epoch": 2.0931254118163847, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4601869583129883, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7215074300765991, + "num_tokens": 493238035.0, + "step": 19060 + }, + { + "epoch": 2.0932352295189984, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3844151496887207, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7322339415550232, + "num_tokens": 493265714.0, + "step": 19061 + }, + { + "epoch": 2.093345047221612, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.257261276245117, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.735968291759491, + "num_tokens": 493294297.0, + "step": 19062 + }, + { + "epoch": 2.093454864924226, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.594237804412842, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7385169863700867, + "num_tokens": 493315677.0, + "step": 19063 + }, + { + "epoch": 2.0935646826268393, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.487947940826416, + "learning_rate": 1e-06, + "loss": 1.0132, + 
"mean_token_accuracy": 0.6990566253662109, + "num_tokens": 493342213.0, + "step": 19064 + }, + { + "epoch": 2.093674500329453, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5779192447662354, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7154654264450073, + "num_tokens": 493365307.0, + "step": 19065 + }, + { + "epoch": 2.0937843180320668, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.1688365936279297, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7041245698928833, + "num_tokens": 493396239.0, + "step": 19066 + }, + { + "epoch": 2.0938941357346805, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.927398920059204, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7164355516433716, + "num_tokens": 493416150.0, + "step": 19067 + }, + { + "epoch": 2.0940039534372943, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.5175528526306152, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6944637894630432, + "num_tokens": 493440906.0, + "step": 19068 + }, + { + "epoch": 2.0941137711399076, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.3861825466156006, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7312771677970886, + "num_tokens": 493467869.0, + "step": 19069 + }, + { + "epoch": 2.0942235888425214, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.3649978637695312, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7090889811515808, + "num_tokens": 493496899.0, + "step": 19070 + }, + { + "epoch": 2.094333406545135, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.7015819549560547, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.728556752204895, + "num_tokens": 493518423.0, + "step": 19071 + }, + { + "epoch": 2.094443224247749, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.5254111289978027, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 
0.7159345746040344, + "num_tokens": 493543772.0, + "step": 19072 + }, + { + "epoch": 2.0945530419503626, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4446346759796143, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7281900644302368, + "num_tokens": 493569302.0, + "step": 19073 + }, + { + "epoch": 2.094662859652976, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.771228790283203, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7204766273498535, + "num_tokens": 493590965.0, + "step": 19074 + }, + { + "epoch": 2.0947726773555897, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4716033935546875, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6972147226333618, + "num_tokens": 493618059.0, + "step": 19075 + }, + { + "epoch": 2.0948824950582035, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.378304958343506, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7225258350372314, + "num_tokens": 493644737.0, + "step": 19076 + }, + { + "epoch": 2.094992312760817, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4165596961975098, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.7047144174575806, + "num_tokens": 493670418.0, + "step": 19077 + }, + { + "epoch": 2.0951021304634305, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.155168056488037, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7281717658042908, + "num_tokens": 493700392.0, + "step": 19078 + }, + { + "epoch": 2.0952119481660443, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.288478374481201, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7311306595802307, + "num_tokens": 493727488.0, + "step": 19079 + }, + { + "epoch": 2.095321765868658, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4760184288024902, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.736402153968811, + 
"num_tokens": 493750634.0, + "step": 19080 + }, + { + "epoch": 2.095431583571272, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 3.1330204010009766, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7373101115226746, + "num_tokens": 493766122.0, + "step": 19081 + }, + { + "epoch": 2.0955414012738856, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3958969116210938, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6909197568893433, + "num_tokens": 493792108.0, + "step": 19082 + }, + { + "epoch": 2.095651218976499, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.355696201324463, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7165766358375549, + "num_tokens": 493818761.0, + "step": 19083 + }, + { + "epoch": 2.0957610366791126, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.635021686553955, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.724668562412262, + "num_tokens": 493842579.0, + "step": 19084 + }, + { + "epoch": 2.0958708543817264, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5850679874420166, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7080867290496826, + "num_tokens": 493866585.0, + "step": 19085 + }, + { + "epoch": 2.09598067208434, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4846956729888916, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7146915793418884, + "num_tokens": 493891549.0, + "step": 19086 + }, + { + "epoch": 2.0960904897869534, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5772078037261963, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6975671052932739, + "num_tokens": 493917048.0, + "step": 19087 + }, + { + "epoch": 2.096200307489567, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.8157782554626465, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7148699760437012, + "num_tokens": 493936071.0, + 
"step": 19088 + }, + { + "epoch": 2.096310125192181, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.709045886993408, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7178810834884644, + "num_tokens": 493957816.0, + "step": 19089 + }, + { + "epoch": 2.0964199428947947, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.549370050430298, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7014332413673401, + "num_tokens": 493983395.0, + "step": 19090 + }, + { + "epoch": 2.0965297605974085, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.36669921875, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6970726251602173, + "num_tokens": 494012211.0, + "step": 19091 + }, + { + "epoch": 2.096639578300022, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.413557767868042, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7044377326965332, + "num_tokens": 494038658.0, + "step": 19092 + }, + { + "epoch": 2.0967493960026355, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4469969272613525, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7152309417724609, + "num_tokens": 494064052.0, + "step": 19093 + }, + { + "epoch": 2.0968592137052493, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.467244863510132, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7198175191879272, + "num_tokens": 494089375.0, + "step": 19094 + }, + { + "epoch": 2.096969031407863, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.314042568206787, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7141657471656799, + "num_tokens": 494118483.0, + "step": 19095 + }, + { + "epoch": 2.097078849110477, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5825014114379883, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.696314811706543, + "num_tokens": 494141958.0, + "step": 19096 + }, + { + "epoch": 
2.09718866681309, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.272859811782837, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7112517952919006, + "num_tokens": 494170703.0, + "step": 19097 + }, + { + "epoch": 2.097298484515704, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5857977867126465, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7069675326347351, + "num_tokens": 494194926.0, + "step": 19098 + }, + { + "epoch": 2.0974083022183176, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4457223415374756, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7377161383628845, + "num_tokens": 494220067.0, + "step": 19099 + }, + { + "epoch": 2.0975181199209314, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2662782669067383, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7292291522026062, + "num_tokens": 494246743.0, + "step": 19100 + }, + { + "epoch": 2.0976279376235447, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.709747552871704, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.7046281695365906, + "num_tokens": 494267627.0, + "step": 19101 + }, + { + "epoch": 2.0977377553261585, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.092421531677246, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7084523439407349, + "num_tokens": 494298287.0, + "step": 19102 + }, + { + "epoch": 2.0978475730287722, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.460240125656128, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7135148048400879, + "num_tokens": 494323848.0, + "step": 19103 + }, + { + "epoch": 2.097957390731386, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.503460168838501, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7239672541618347, + "num_tokens": 494348007.0, + "step": 19104 + }, + { + "epoch": 2.0980672084339997, + "ewc_loss": 
2.0623207092285156e-05, + "grad_norm": 2.5969738960266113, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7163546085357666, + "num_tokens": 494372682.0, + "step": 19105 + }, + { + "epoch": 2.098177026136613, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5075812339782715, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7308880686759949, + "num_tokens": 494398639.0, + "step": 19106 + }, + { + "epoch": 2.098286843839227, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.776916027069092, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7124766707420349, + "num_tokens": 494420579.0, + "step": 19107 + }, + { + "epoch": 2.0983966615418406, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.639758825302124, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7103085517883301, + "num_tokens": 494444564.0, + "step": 19108 + }, + { + "epoch": 2.0985064792444543, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5274643898010254, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7107860445976257, + "num_tokens": 494468730.0, + "step": 19109 + }, + { + "epoch": 2.0986162969470676, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5510382652282715, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7099788188934326, + "num_tokens": 494494472.0, + "step": 19110 + }, + { + "epoch": 2.0987261146496814, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3291897773742676, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7011781930923462, + "num_tokens": 494523174.0, + "step": 19111 + }, + { + "epoch": 2.098835932352295, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6088321208953857, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7184680104255676, + "num_tokens": 494545274.0, + "step": 19112 + }, + { + "epoch": 2.098945750054909, + "ewc_loss": 2.0623207092285156e-05, + 
"grad_norm": 2.6266353130340576, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7209840416908264, + "num_tokens": 494566831.0, + "step": 19113 + }, + { + "epoch": 2.0990555677575227, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2039246559143066, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7087433338165283, + "num_tokens": 494598634.0, + "step": 19114 + }, + { + "epoch": 2.099165385460136, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4591152667999268, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7063291072845459, + "num_tokens": 494624358.0, + "step": 19115 + }, + { + "epoch": 2.0992752031627497, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.333662986755371, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7180605530738831, + "num_tokens": 494652414.0, + "step": 19116 + }, + { + "epoch": 2.0993850208653635, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4363837242126465, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7371058464050293, + "num_tokens": 494677253.0, + "step": 19117 + }, + { + "epoch": 2.0994948385679773, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.755018472671509, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7318474054336548, + "num_tokens": 494697568.0, + "step": 19118 + }, + { + "epoch": 2.099604656270591, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5106985569000244, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7147613167762756, + "num_tokens": 494723570.0, + "step": 19119 + }, + { + "epoch": 2.0997144739732043, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.709022045135498, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7235007286071777, + "num_tokens": 494744675.0, + "step": 19120 + }, + { + "epoch": 2.099824291675818, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.275861978530884, 
+ "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7167655229568481, + "num_tokens": 494771771.0, + "step": 19121 + }, + { + "epoch": 2.099934109378432, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 7.170312404632568, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7100114226341248, + "num_tokens": 494796506.0, + "step": 19122 + }, + { + "epoch": 2.1000439270810456, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.1575446128845215, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.72374027967453, + "num_tokens": 494829970.0, + "step": 19123 + }, + { + "epoch": 2.1001537447836593, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.469308614730835, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7051905393600464, + "num_tokens": 494856312.0, + "step": 19124 + }, + { + "epoch": 2.1002635624862727, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2189173698425293, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7266099452972412, + "num_tokens": 494887483.0, + "step": 19125 + }, + { + "epoch": 2.1003733801888864, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.52447509765625, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7224572896957397, + "num_tokens": 494910484.0, + "step": 19126 + }, + { + "epoch": 2.1004831978915, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2340939044952393, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7338708639144897, + "num_tokens": 494942311.0, + "step": 19127 + }, + { + "epoch": 2.100593015594114, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.710315704345703, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7426758408546448, + "num_tokens": 494963926.0, + "step": 19128 + }, + { + "epoch": 2.1007028332967272, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 32.074256896972656, + "learning_rate": 1e-06, + "loss": 
0.8421, + "mean_token_accuracy": 0.7497873306274414, + "num_tokens": 494992019.0, + "step": 19129 + }, + { + "epoch": 2.100812650999341, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.12756085395813, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6949843764305115, + "num_tokens": 495028204.0, + "step": 19130 + }, + { + "epoch": 2.1009224687019548, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7391538619995117, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7246804237365723, + "num_tokens": 495048069.0, + "step": 19131 + }, + { + "epoch": 2.1010322864045685, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.346054792404175, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7373877167701721, + "num_tokens": 495075347.0, + "step": 19132 + }, + { + "epoch": 2.1011421041071823, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5586938858032227, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7403448820114136, + "num_tokens": 495099759.0, + "step": 19133 + }, + { + "epoch": 2.1012519218097956, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.31111741065979, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7152696847915649, + "num_tokens": 495128353.0, + "step": 19134 + }, + { + "epoch": 2.1013617395124093, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.8396334648132324, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7322038412094116, + "num_tokens": 495148321.0, + "step": 19135 + }, + { + "epoch": 2.101471557215023, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4732909202575684, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.712775707244873, + "num_tokens": 495173397.0, + "step": 19136 + }, + { + "epoch": 2.101581374917637, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4863154888153076, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 
0.7023495435714722, + "num_tokens": 495197877.0, + "step": 19137 + }, + { + "epoch": 2.10169119262025, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2117040157318115, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.712590217590332, + "num_tokens": 495227587.0, + "step": 19138 + }, + { + "epoch": 2.101801010322864, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 3.867271661758423, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7079620361328125, + "num_tokens": 495253735.0, + "step": 19139 + }, + { + "epoch": 2.1019108280254777, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6144440174102783, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7208368182182312, + "num_tokens": 495275860.0, + "step": 19140 + }, + { + "epoch": 2.1020206457280914, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.713101863861084, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7407260537147522, + "num_tokens": 495295552.0, + "step": 19141 + }, + { + "epoch": 2.102130463430705, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.572796106338501, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.721849799156189, + "num_tokens": 495318799.0, + "step": 19142 + }, + { + "epoch": 2.1022402811333185, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6730735301971436, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7287951707839966, + "num_tokens": 495340919.0, + "step": 19143 + }, + { + "epoch": 2.1023500988359323, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3912792205810547, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7143609523773193, + "num_tokens": 495366914.0, + "step": 19144 + }, + { + "epoch": 2.102459916538546, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.9159445762634277, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7262392044067383, + "num_tokens": 
495384879.0, + "step": 19145 + }, + { + "epoch": 2.10256973424116, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6727042198181152, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7091150283813477, + "num_tokens": 495407837.0, + "step": 19146 + }, + { + "epoch": 2.1026795519437735, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3503684997558594, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7273523807525635, + "num_tokens": 495434520.0, + "step": 19147 + }, + { + "epoch": 2.102789369646387, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.955113172531128, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7324452996253967, + "num_tokens": 495452239.0, + "step": 19148 + }, + { + "epoch": 2.1028991873490006, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3991475105285645, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.7117420434951782, + "num_tokens": 495478247.0, + "step": 19149 + }, + { + "epoch": 2.1030090050516144, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2828004360198975, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.703018307685852, + "num_tokens": 495507686.0, + "step": 19150 + }, + { + "epoch": 2.103118822754228, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.331448554992676, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.708669900894165, + "num_tokens": 495542033.0, + "step": 19151 + }, + { + "epoch": 2.1032286404568414, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.328617811203003, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7380003929138184, + "num_tokens": 495568759.0, + "step": 19152 + }, + { + "epoch": 2.103338458159455, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.514943838119507, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7137266397476196, + "num_tokens": 495591660.0, + "step": 19153 + }, + 
{ + "epoch": 2.103448275862069, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3205435276031494, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7283732295036316, + "num_tokens": 495618271.0, + "step": 19154 + }, + { + "epoch": 2.1035580935646827, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.627755641937256, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7016794681549072, + "num_tokens": 495642256.0, + "step": 19155 + }, + { + "epoch": 2.1036679112672965, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 4.0454325675964355, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7396930456161499, + "num_tokens": 495662454.0, + "step": 19156 + }, + { + "epoch": 2.1037777289699098, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3595190048217773, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7067165374755859, + "num_tokens": 495690054.0, + "step": 19157 + }, + { + "epoch": 2.1038875466725235, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5816919803619385, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7284238338470459, + "num_tokens": 495713834.0, + "step": 19158 + }, + { + "epoch": 2.1039973643751373, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3240408897399902, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7426391839981079, + "num_tokens": 495740116.0, + "step": 19159 + }, + { + "epoch": 2.104107182077751, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7020790576934814, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7204984426498413, + "num_tokens": 495762199.0, + "step": 19160 + }, + { + "epoch": 2.104216999780365, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.387482166290283, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7282216548919678, + "num_tokens": 495787374.0, + "step": 19161 + }, + { + "epoch": 
2.104326817482978, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.157039165496826, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.710157036781311, + "num_tokens": 495820269.0, + "step": 19162 + }, + { + "epoch": 2.104436635185592, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3922743797302246, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7090762853622437, + "num_tokens": 495848752.0, + "step": 19163 + }, + { + "epoch": 2.1045464528882056, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.646989107131958, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7153926491737366, + "num_tokens": 495871746.0, + "step": 19164 + }, + { + "epoch": 2.1046562705908194, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5966637134552, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.705146074295044, + "num_tokens": 495894868.0, + "step": 19165 + }, + { + "epoch": 2.1047660882934327, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.268864631652832, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7024624943733215, + "num_tokens": 495923860.0, + "step": 19166 + }, + { + "epoch": 2.1048759059960465, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.0868420600891113, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7216160297393799, + "num_tokens": 495957197.0, + "step": 19167 + }, + { + "epoch": 2.10498572369866, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.288743734359741, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.703421950340271, + "num_tokens": 495985904.0, + "step": 19168 + }, + { + "epoch": 2.105095541401274, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2800943851470947, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.677741289138794, + "num_tokens": 496015293.0, + "step": 19169 + }, + { + "epoch": 2.1052053591038877, + "ewc_loss": 
2.0623207092285156e-05, + "grad_norm": 2.473388671875, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7119749784469604, + "num_tokens": 496042205.0, + "step": 19170 + }, + { + "epoch": 2.105315176806501, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.467477321624756, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7111189365386963, + "num_tokens": 496067972.0, + "step": 19171 + }, + { + "epoch": 2.105424994509115, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4272661209106445, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7341527938842773, + "num_tokens": 496092607.0, + "step": 19172 + }, + { + "epoch": 2.1055348122117286, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3184409141540527, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7148867845535278, + "num_tokens": 496122886.0, + "step": 19173 + }, + { + "epoch": 2.1056446299143423, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.6017017364501953, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7010774612426758, + "num_tokens": 496146924.0, + "step": 19174 + }, + { + "epoch": 2.105754447616956, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.7342896461486816, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7213455438613892, + "num_tokens": 496166899.0, + "step": 19175 + }, + { + "epoch": 2.1058642653195694, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3389713764190674, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7029860019683838, + "num_tokens": 496196377.0, + "step": 19176 + }, + { + "epoch": 2.105974083022183, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.445927858352661, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7118183970451355, + "num_tokens": 496223225.0, + "step": 19177 + }, + { + "epoch": 2.106083900724797, + "ewc_loss": 2.0623207092285156e-05, + 
"grad_norm": 2.4100890159606934, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6998120546340942, + "num_tokens": 496250112.0, + "step": 19178 + }, + { + "epoch": 2.1061937184274107, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2174184322357178, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7246285080909729, + "num_tokens": 496279922.0, + "step": 19179 + }, + { + "epoch": 2.106303536130024, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.592899799346924, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6972640752792358, + "num_tokens": 496304436.0, + "step": 19180 + }, + { + "epoch": 2.1064133538326377, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.342052936553955, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7043737769126892, + "num_tokens": 496334696.0, + "step": 19181 + }, + { + "epoch": 2.1065231715352515, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5541610717773438, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7211182117462158, + "num_tokens": 496358059.0, + "step": 19182 + }, + { + "epoch": 2.1066329892378652, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.162524461746216, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7120532989501953, + "num_tokens": 496387740.0, + "step": 19183 + }, + { + "epoch": 2.106742806940479, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.375436544418335, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7038944959640503, + "num_tokens": 496415509.0, + "step": 19184 + }, + { + "epoch": 2.1068526246430923, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.794755220413208, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7126539945602417, + "num_tokens": 496437011.0, + "step": 19185 + }, + { + "epoch": 2.106962442345706, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.1510446071624756, 
+ "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7101187705993652, + "num_tokens": 496469315.0, + "step": 19186 + }, + { + "epoch": 2.10707226004832, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.599304676055908, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7075458765029907, + "num_tokens": 496496872.0, + "step": 19187 + }, + { + "epoch": 2.1071820777509336, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2574245929718018, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.725887656211853, + "num_tokens": 496526371.0, + "step": 19188 + }, + { + "epoch": 2.107291895453547, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.422623634338379, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7116661071777344, + "num_tokens": 496552103.0, + "step": 19189 + }, + { + "epoch": 2.1074017131561606, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5622646808624268, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.7034703493118286, + "num_tokens": 496579357.0, + "step": 19190 + }, + { + "epoch": 2.1075115308587744, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.5280590057373047, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6972458958625793, + "num_tokens": 496605039.0, + "step": 19191 + }, + { + "epoch": 2.107621348561388, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.9706153869628906, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7432879209518433, + "num_tokens": 496621766.0, + "step": 19192 + }, + { + "epoch": 2.107731166264002, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3544795513153076, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.724805474281311, + "num_tokens": 496649137.0, + "step": 19193 + }, + { + "epoch": 2.1078409839666152, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4274957180023193, + "learning_rate": 1e-06, + 
"loss": 0.9365, + "mean_token_accuracy": 0.7189209461212158, + "num_tokens": 496675600.0, + "step": 19194 + }, + { + "epoch": 2.107950801669229, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.3642337322235107, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7156473398208618, + "num_tokens": 496702492.0, + "step": 19195 + }, + { + "epoch": 2.1080606193718427, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.4300124645233154, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7297250628471375, + "num_tokens": 496729085.0, + "step": 19196 + }, + { + "epoch": 2.1081704370744565, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.473318338394165, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7136419415473938, + "num_tokens": 496752094.0, + "step": 19197 + }, + { + "epoch": 2.1082802547770703, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.2430200576782227, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6894403696060181, + "num_tokens": 496785434.0, + "step": 19198 + }, + { + "epoch": 2.1083900724796836, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.232480525970459, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.7041149139404297, + "num_tokens": 496814915.0, + "step": 19199 + }, + { + "epoch": 2.1084998901822973, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.2614059448242188, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7220711708068848, + "num_tokens": 496842686.0, + "step": 19200 + }, + { + "epoch": 2.108609707884911, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.741027355194092, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7162914276123047, + "num_tokens": 496863892.0, + "step": 19201 + }, + { + "epoch": 2.108719525587525, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.49997615814209, + "learning_rate": 1e-06, + "loss": 1.0258, + 
"mean_token_accuracy": 0.7191171646118164, + "num_tokens": 496886522.0, + "step": 19202 + }, + { + "epoch": 2.108829343290138, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 2.577138900756836, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7237724661827087, + "num_tokens": 496909244.0, + "step": 19203 + }, + { + "epoch": 2.108939160992752, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.5773940086364746, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7070214748382568, + "num_tokens": 496935758.0, + "step": 19204 + }, + { + "epoch": 2.1090489786953657, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.824828863143921, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7211318612098694, + "num_tokens": 496960666.0, + "step": 19205 + }, + { + "epoch": 2.1091587963979794, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.345698356628418, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.712881326675415, + "num_tokens": 496987818.0, + "step": 19206 + }, + { + "epoch": 2.109268614100593, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.487473964691162, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7164844870567322, + "num_tokens": 497013415.0, + "step": 19207 + }, + { + "epoch": 2.1093784318032065, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.7181904315948486, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7208998799324036, + "num_tokens": 497034957.0, + "step": 19208 + }, + { + "epoch": 2.1094882495058203, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.6179745197296143, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7263015508651733, + "num_tokens": 497057299.0, + "step": 19209 + }, + { + "epoch": 2.109598067208434, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 3.6024112701416016, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 
0.7067261934280396, + "num_tokens": 497083160.0, + "step": 19210 + }, + { + "epoch": 2.1097078849110478, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.4275872707366943, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.7112510800361633, + "num_tokens": 497110314.0, + "step": 19211 + }, + { + "epoch": 2.1098177026136615, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.5395712852478027, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7350708246231079, + "num_tokens": 497133958.0, + "step": 19212 + }, + { + "epoch": 2.109927520316275, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.626413345336914, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7208825945854187, + "num_tokens": 497156841.0, + "step": 19213 + }, + { + "epoch": 2.1100373380188886, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.403738260269165, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7362179756164551, + "num_tokens": 497181270.0, + "step": 19214 + }, + { + "epoch": 2.1101471557215024, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4999892711639404, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7185319066047668, + "num_tokens": 497206080.0, + "step": 19215 + }, + { + "epoch": 2.110256973424116, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3021657466888428, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.7050960063934326, + "num_tokens": 497236514.0, + "step": 19216 + }, + { + "epoch": 2.1103667911267294, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2924089431762695, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.7043562531471252, + "num_tokens": 497265054.0, + "step": 19217 + }, + { + "epoch": 2.110476608829343, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.7343602180480957, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7320517301559448, + "num_tokens": 
497284782.0, + "step": 19218 + }, + { + "epoch": 2.110586426531957, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.234168767929077, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7155287265777588, + "num_tokens": 497315170.0, + "step": 19219 + }, + { + "epoch": 2.1106962442345707, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.51151442527771, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6999257802963257, + "num_tokens": 497342680.0, + "step": 19220 + }, + { + "epoch": 2.1108060619371845, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.7552218437194824, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7047198414802551, + "num_tokens": 497365246.0, + "step": 19221 + }, + { + "epoch": 2.1109158796397978, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.6707494258880615, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7120206952095032, + "num_tokens": 497386614.0, + "step": 19222 + }, + { + "epoch": 2.1110256973424115, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.6832172870635986, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7166147232055664, + "num_tokens": 497408631.0, + "step": 19223 + }, + { + "epoch": 2.1111355150450253, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3498618602752686, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7385568022727966, + "num_tokens": 497435262.0, + "step": 19224 + }, + { + "epoch": 2.111245332747639, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3626959323883057, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7216092348098755, + "num_tokens": 497462587.0, + "step": 19225 + }, + { + "epoch": 2.111355150450253, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.5203537940979004, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.723203718662262, + "num_tokens": 497486300.0, + "step": 19226 + }, + { + 
"epoch": 2.111464968152866, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.6802661418914795, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.726362943649292, + "num_tokens": 497507945.0, + "step": 19227 + }, + { + "epoch": 2.11157478585548, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.2752790451049805, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6935923099517822, + "num_tokens": 497540400.0, + "step": 19228 + }, + { + "epoch": 2.1116846035580936, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 2.6959595680236816, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7212682962417603, + "num_tokens": 497562841.0, + "step": 19229 + }, + { + "epoch": 2.1117944212607074, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.465428113937378, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7051814794540405, + "num_tokens": 497586063.0, + "step": 19230 + }, + { + "epoch": 2.1119042389633207, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.439826488494873, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.723721981048584, + "num_tokens": 497611524.0, + "step": 19231 + }, + { + "epoch": 2.1120140566659344, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4625418186187744, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7190065383911133, + "num_tokens": 497636834.0, + "step": 19232 + }, + { + "epoch": 2.112123874368548, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.5408334732055664, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7208801507949829, + "num_tokens": 497662050.0, + "step": 19233 + }, + { + "epoch": 2.112233692071162, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.244147300720215, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7187488675117493, + "num_tokens": 497691458.0, + "step": 19234 + }, + { + "epoch": 2.1123435097737757, + "ewc_loss": 
2.09808349609375e-05, + "grad_norm": 2.703484535217285, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7214458584785461, + "num_tokens": 497713683.0, + "step": 19235 + }, + { + "epoch": 2.112453327476389, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.450272798538208, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7171059846878052, + "num_tokens": 497740788.0, + "step": 19236 + }, + { + "epoch": 2.112563145179003, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3322808742523193, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7074877619743347, + "num_tokens": 497767673.0, + "step": 19237 + }, + { + "epoch": 2.1126729628816165, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.683121681213379, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7337372303009033, + "num_tokens": 497787761.0, + "step": 19238 + }, + { + "epoch": 2.1127827805842303, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3102362155914307, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7096776366233826, + "num_tokens": 497818542.0, + "step": 19239 + }, + { + "epoch": 2.1128925982868436, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4732446670532227, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7044798135757446, + "num_tokens": 497845783.0, + "step": 19240 + }, + { + "epoch": 2.1130024159894574, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2646093368530273, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7103886604309082, + "num_tokens": 497876838.0, + "step": 19241 + }, + { + "epoch": 2.113112233692071, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3297319412231445, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7043083906173706, + "num_tokens": 497906561.0, + "step": 19242 + }, + { + "epoch": 2.113222051394685, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 
2.338135242462158, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6904669404029846, + "num_tokens": 497936046.0, + "step": 19243 + }, + { + "epoch": 2.1133318690972986, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.691481351852417, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7128503322601318, + "num_tokens": 497959831.0, + "step": 19244 + }, + { + "epoch": 2.113441686799912, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.375783920288086, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7181137204170227, + "num_tokens": 497986435.0, + "step": 19245 + }, + { + "epoch": 2.1135515045025257, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.265554666519165, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7076661586761475, + "num_tokens": 498016562.0, + "step": 19246 + }, + { + "epoch": 2.1136613222051395, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3717057704925537, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7258666753768921, + "num_tokens": 498042669.0, + "step": 19247 + }, + { + "epoch": 2.1137711399077532, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.8374478816986084, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.708997368812561, + "num_tokens": 498062798.0, + "step": 19248 + }, + { + "epoch": 2.113880957610367, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.381215810775757, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7060256004333496, + "num_tokens": 498090075.0, + "step": 19249 + }, + { + "epoch": 2.1139907753129803, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2387561798095703, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7072324156761169, + "num_tokens": 498119591.0, + "step": 19250 + }, + { + "epoch": 2.114100593015594, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.065293073654175, + "learning_rate": 1e-06, + 
"loss": 0.9911, + "mean_token_accuracy": 0.7118844985961914, + "num_tokens": 498154592.0, + "step": 19251 + }, + { + "epoch": 2.114210410718208, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.1941847801208496, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7021784782409668, + "num_tokens": 498185943.0, + "step": 19252 + }, + { + "epoch": 2.1143202284208216, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.514688730239868, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6986428499221802, + "num_tokens": 498212773.0, + "step": 19253 + }, + { + "epoch": 2.1144300461234353, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2346580028533936, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.723877489566803, + "num_tokens": 498240260.0, + "step": 19254 + }, + { + "epoch": 2.1145398638260486, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.600228786468506, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7121760845184326, + "num_tokens": 498264336.0, + "step": 19255 + }, + { + "epoch": 2.1146496815286624, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.288292169570923, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7065313458442688, + "num_tokens": 498293781.0, + "step": 19256 + }, + { + "epoch": 2.114759499231276, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.428713083267212, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7088793516159058, + "num_tokens": 498322343.0, + "step": 19257 + }, + { + "epoch": 2.11486931693389, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2425758838653564, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7287337183952332, + "num_tokens": 498352780.0, + "step": 19258 + }, + { + "epoch": 2.114979134636503, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.374077081680298, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 
0.7238924503326416, + "num_tokens": 498377441.0, + "step": 19259 + }, + { + "epoch": 2.115088952339117, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.7733888626098633, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7127093076705933, + "num_tokens": 498397880.0, + "step": 19260 + }, + { + "epoch": 2.1151987700417307, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3309481143951416, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7036752700805664, + "num_tokens": 498426227.0, + "step": 19261 + }, + { + "epoch": 2.1153085877443445, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4376115798950195, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7072394490242004, + "num_tokens": 498453888.0, + "step": 19262 + }, + { + "epoch": 2.1154184054469582, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.441101312637329, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7143001556396484, + "num_tokens": 498482431.0, + "step": 19263 + }, + { + "epoch": 2.1155282231495716, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.1084322929382324, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7006030678749084, + "num_tokens": 498517225.0, + "step": 19264 + }, + { + "epoch": 2.1156380408521853, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4133124351501465, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7193934917449951, + "num_tokens": 498542827.0, + "step": 19265 + }, + { + "epoch": 2.115747858554799, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2598042488098145, + "learning_rate": 1e-06, + "loss": 1.0816, + "mean_token_accuracy": 0.6858391761779785, + "num_tokens": 498574854.0, + "step": 19266 + }, + { + "epoch": 2.115857676257413, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.5606541633605957, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7103209495544434, + "num_tokens": 
498600075.0, + "step": 19267 + }, + { + "epoch": 2.115967493960026, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2518115043640137, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.705654501914978, + "num_tokens": 498627775.0, + "step": 19268 + }, + { + "epoch": 2.11607731166264, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4047060012817383, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7084447741508484, + "num_tokens": 498652204.0, + "step": 19269 + }, + { + "epoch": 2.1161871293652537, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.7323684692382812, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7259291410446167, + "num_tokens": 498672745.0, + "step": 19270 + }, + { + "epoch": 2.1162969470678674, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.6408419609069824, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7193999886512756, + "num_tokens": 498695867.0, + "step": 19271 + }, + { + "epoch": 2.116406764770481, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.843405246734619, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7172131538391113, + "num_tokens": 498715297.0, + "step": 19272 + }, + { + "epoch": 2.1165165824730945, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.546499252319336, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7156233787536621, + "num_tokens": 498739354.0, + "step": 19273 + }, + { + "epoch": 2.1166264001757082, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.8025174140930176, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7277451753616333, + "num_tokens": 498762108.0, + "step": 19274 + }, + { + "epoch": 2.116736217878322, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4142963886260986, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7197712659835815, + "num_tokens": 498789026.0, + "step": 19275 + }, + { + "epoch": 
2.1168460355809358, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3811087608337402, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7109450101852417, + "num_tokens": 498814879.0, + "step": 19276 + }, + { + "epoch": 2.1169558532835495, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.6073477268218994, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7252477407455444, + "num_tokens": 498836829.0, + "step": 19277 + }, + { + "epoch": 2.117065670986163, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.5734574794769287, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7084524631500244, + "num_tokens": 498861658.0, + "step": 19278 + }, + { + "epoch": 2.1171754886887766, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.909925937652588, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7133405208587646, + "num_tokens": 498880637.0, + "step": 19279 + }, + { + "epoch": 2.1172853063913903, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.467815399169922, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7019263505935669, + "num_tokens": 498907284.0, + "step": 19280 + }, + { + "epoch": 2.117395124094004, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.357231378555298, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7145955562591553, + "num_tokens": 498936942.0, + "step": 19281 + }, + { + "epoch": 2.1175049417966174, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.1705806255340576, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7372514009475708, + "num_tokens": 498966621.0, + "step": 19282 + }, + { + "epoch": 2.117614759499231, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.5427119731903076, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7223615646362305, + "num_tokens": 498990140.0, + "step": 19283 + }, + { + "epoch": 2.117724577201845, + "ewc_loss": 
2.09808349609375e-05, + "grad_norm": 2.690577983856201, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7155466079711914, + "num_tokens": 499011949.0, + "step": 19284 + }, + { + "epoch": 2.1178343949044587, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3471732139587402, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7205749750137329, + "num_tokens": 499038072.0, + "step": 19285 + }, + { + "epoch": 2.1179442126070724, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.665693759918213, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7494540214538574, + "num_tokens": 499058058.0, + "step": 19286 + }, + { + "epoch": 2.1180540303096858, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2746856212615967, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.7019785642623901, + "num_tokens": 499090487.0, + "step": 19287 + }, + { + "epoch": 2.1181638480122995, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4842593669891357, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.703809916973114, + "num_tokens": 499117531.0, + "step": 19288 + }, + { + "epoch": 2.1182736657149133, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.323647975921631, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7279154062271118, + "num_tokens": 499146062.0, + "step": 19289 + }, + { + "epoch": 2.118383483417527, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.6485631465911865, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6950732469558716, + "num_tokens": 499178273.0, + "step": 19290 + }, + { + "epoch": 2.118493301120141, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4471120834350586, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7155252695083618, + "num_tokens": 499209831.0, + "step": 19291 + }, + { + "epoch": 2.118603118822754, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 
2.379117727279663, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7090964317321777, + "num_tokens": 499236931.0, + "step": 19292 + }, + { + "epoch": 2.118712936525368, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.19197416305542, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7048582434654236, + "num_tokens": 499268058.0, + "step": 19293 + }, + { + "epoch": 2.1188227542279816, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4527981281280518, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7319304943084717, + "num_tokens": 499294168.0, + "step": 19294 + }, + { + "epoch": 2.1189325719305954, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4334616661071777, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7107855081558228, + "num_tokens": 499321024.0, + "step": 19295 + }, + { + "epoch": 2.1190423896332087, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.148857831954956, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7346271276473999, + "num_tokens": 499352640.0, + "step": 19296 + }, + { + "epoch": 2.1191522073358224, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.393202543258667, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7136696577072144, + "num_tokens": 499379641.0, + "step": 19297 + }, + { + "epoch": 2.119262025038436, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2141919136047363, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7056033611297607, + "num_tokens": 499411356.0, + "step": 19298 + }, + { + "epoch": 2.11937184274105, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.660715341567993, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7309086322784424, + "num_tokens": 499431949.0, + "step": 19299 + }, + { + "epoch": 2.1194816604436637, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.986726999282837, + "learning_rate": 1e-06, + 
"loss": 0.9577, + "mean_token_accuracy": 0.7190279364585876, + "num_tokens": 499448969.0, + "step": 19300 + }, + { + "epoch": 2.119591478146277, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.534698247909546, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.715814471244812, + "num_tokens": 499474514.0, + "step": 19301 + }, + { + "epoch": 2.1197012958488908, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.542080879211426, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7238304615020752, + "num_tokens": 499497833.0, + "step": 19302 + }, + { + "epoch": 2.1198111135515045, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.5040183067321777, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7277915477752686, + "num_tokens": 499527147.0, + "step": 19303 + }, + { + "epoch": 2.1199209312541183, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.4383771419525146, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7053547501564026, + "num_tokens": 499553289.0, + "step": 19304 + }, + { + "epoch": 2.120030748956732, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.7035181522369385, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7418229579925537, + "num_tokens": 499575098.0, + "step": 19305 + }, + { + "epoch": 2.1201405666593454, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.3152341842651367, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6989636421203613, + "num_tokens": 499605573.0, + "step": 19306 + }, + { + "epoch": 2.120250384361959, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.593653917312622, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7183623313903809, + "num_tokens": 499630419.0, + "step": 19307 + }, + { + "epoch": 2.120360202064573, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4591474533081055, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 
0.7223768830299377, + "num_tokens": 499655863.0, + "step": 19308 + }, + { + "epoch": 2.1204700197671866, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.111971855163574, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6993894577026367, + "num_tokens": 499689025.0, + "step": 19309 + }, + { + "epoch": 2.1205798374698, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2977383136749268, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7225626111030579, + "num_tokens": 499717713.0, + "step": 19310 + }, + { + "epoch": 2.1206896551724137, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.690380334854126, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7353683114051819, + "num_tokens": 499739851.0, + "step": 19311 + }, + { + "epoch": 2.1207994728750275, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3297500610351562, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7205137014389038, + "num_tokens": 499770583.0, + "step": 19312 + }, + { + "epoch": 2.120909290577641, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4014906883239746, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7332013845443726, + "num_tokens": 499798119.0, + "step": 19313 + }, + { + "epoch": 2.121019108280255, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.258819818496704, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6882665753364563, + "num_tokens": 499828541.0, + "step": 19314 + }, + { + "epoch": 2.1211289259828683, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.621781587600708, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7385090589523315, + "num_tokens": 499849585.0, + "step": 19315 + }, + { + "epoch": 2.121238743685482, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5401313304901123, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7245312929153442, + "num_tokens": 
499871411.0, + "step": 19316 + }, + { + "epoch": 2.121348561388096, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6079928874969482, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7158398032188416, + "num_tokens": 499895292.0, + "step": 19317 + }, + { + "epoch": 2.1214583790907096, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.349125862121582, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7045036554336548, + "num_tokens": 499925480.0, + "step": 19318 + }, + { + "epoch": 2.121568196793323, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6918394565582275, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.707916796207428, + "num_tokens": 499948452.0, + "step": 19319 + }, + { + "epoch": 2.1216780144959366, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.527784824371338, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7208182215690613, + "num_tokens": 499972771.0, + "step": 19320 + }, + { + "epoch": 2.1217878321985504, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.859325647354126, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7195847034454346, + "num_tokens": 499993166.0, + "step": 19321 + }, + { + "epoch": 2.121897649901164, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2490456104278564, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7284812927246094, + "num_tokens": 500024026.0, + "step": 19322 + }, + { + "epoch": 2.122007467603778, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3533756732940674, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7185304164886475, + "num_tokens": 500052901.0, + "step": 19323 + }, + { + "epoch": 2.122117285306391, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5006508827209473, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.711625874042511, + "num_tokens": 500076201.0, + "step": 19324 + }, + { + 
"epoch": 2.122227103009005, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3765387535095215, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6993470191955566, + "num_tokens": 500102261.0, + "step": 19325 + }, + { + "epoch": 2.1223369207116187, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.914982795715332, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7362248301506042, + "num_tokens": 500121653.0, + "step": 19326 + }, + { + "epoch": 2.1224467384142325, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3677971363067627, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7219678163528442, + "num_tokens": 500150513.0, + "step": 19327 + }, + { + "epoch": 2.1225565561168462, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.467766761779785, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7065464854240417, + "num_tokens": 500176494.0, + "step": 19328 + }, + { + "epoch": 2.1226663738194596, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.9656693935394287, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7342479825019836, + "num_tokens": 500197664.0, + "step": 19329 + }, + { + "epoch": 2.1227761915220733, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.391491413116455, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.7030653357505798, + "num_tokens": 500225618.0, + "step": 19330 + }, + { + "epoch": 2.122886009224687, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4752066135406494, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7175736427307129, + "num_tokens": 500250309.0, + "step": 19331 + }, + { + "epoch": 2.122995826927301, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.602431535720825, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7230046391487122, + "num_tokens": 500274450.0, + "step": 19332 + }, + { + "epoch": 2.123105644629914, + 
"ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.307854175567627, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7086853981018066, + "num_tokens": 500305517.0, + "step": 19333 + }, + { + "epoch": 2.123215462332528, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5083553791046143, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7501066327095032, + "num_tokens": 500329715.0, + "step": 19334 + }, + { + "epoch": 2.1233252800351416, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6202213764190674, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7312159538269043, + "num_tokens": 500352933.0, + "step": 19335 + }, + { + "epoch": 2.1234350977377554, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6052088737487793, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7136016488075256, + "num_tokens": 500374961.0, + "step": 19336 + }, + { + "epoch": 2.123544915440369, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.311802625656128, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7222532629966736, + "num_tokens": 500404202.0, + "step": 19337 + }, + { + "epoch": 2.1236547331429825, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.356389045715332, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7177125215530396, + "num_tokens": 500431137.0, + "step": 19338 + }, + { + "epoch": 2.1237645508455962, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.455524444580078, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.728787899017334, + "num_tokens": 500456704.0, + "step": 19339 + }, + { + "epoch": 2.12387436854821, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.670182466506958, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7220220565795898, + "num_tokens": 500479314.0, + "step": 19340 + }, + { + "epoch": 2.1239841862508237, + "ewc_loss": 2.110004425048828e-05, + 
"grad_norm": 2.499755859375, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7126830816268921, + "num_tokens": 500502601.0, + "step": 19341 + }, + { + "epoch": 2.1240940039534375, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.650397539138794, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.7065219879150391, + "num_tokens": 500525637.0, + "step": 19342 + }, + { + "epoch": 2.124203821656051, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3844053745269775, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.711648166179657, + "num_tokens": 500551665.0, + "step": 19343 + }, + { + "epoch": 2.1243136393586646, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4247326850891113, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7046139240264893, + "num_tokens": 500580210.0, + "step": 19344 + }, + { + "epoch": 2.1244234570612783, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.571161985397339, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7027512788772583, + "num_tokens": 500605288.0, + "step": 19345 + }, + { + "epoch": 2.124533274763892, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2214651107788086, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7053146362304688, + "num_tokens": 500634790.0, + "step": 19346 + }, + { + "epoch": 2.1246430924665054, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3980965614318848, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6924114227294922, + "num_tokens": 500661686.0, + "step": 19347 + }, + { + "epoch": 2.124752910169119, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7276015281677246, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7312531471252441, + "num_tokens": 500682856.0, + "step": 19348 + }, + { + "epoch": 2.124862727871733, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2206618785858154, + 
"learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.70756995677948, + "num_tokens": 500713087.0, + "step": 19349 + }, + { + "epoch": 2.1249725455743467, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6504790782928467, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.718544602394104, + "num_tokens": 500736800.0, + "step": 19350 + }, + { + "epoch": 2.1250823632769604, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6403675079345703, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7283023595809937, + "num_tokens": 500760036.0, + "step": 19351 + }, + { + "epoch": 2.1251921809795737, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6180381774902344, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7177218198776245, + "num_tokens": 500786040.0, + "step": 19352 + }, + { + "epoch": 2.1253019986821875, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3154098987579346, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7204539179801941, + "num_tokens": 500813124.0, + "step": 19353 + }, + { + "epoch": 2.1254118163848013, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.45731520652771, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7239545583724976, + "num_tokens": 500838213.0, + "step": 19354 + }, + { + "epoch": 2.125521634087415, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.532045602798462, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7206158638000488, + "num_tokens": 500863071.0, + "step": 19355 + }, + { + "epoch": 2.1256314517900288, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.471160650253296, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7118074893951416, + "num_tokens": 500889884.0, + "step": 19356 + }, + { + "epoch": 2.125741269492642, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3233134746551514, + "learning_rate": 1e-06, + "loss": 0.8816, 
+ "mean_token_accuracy": 0.734007716178894, + "num_tokens": 500917979.0, + "step": 19357 + }, + { + "epoch": 2.125851087195256, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4146180152893066, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7107474207878113, + "num_tokens": 500944795.0, + "step": 19358 + }, + { + "epoch": 2.1259609048978696, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.318321704864502, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7110589742660522, + "num_tokens": 500973310.0, + "step": 19359 + }, + { + "epoch": 2.1260707226004834, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3228342533111572, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7157363891601562, + "num_tokens": 501001309.0, + "step": 19360 + }, + { + "epoch": 2.1261805403030967, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.329130172729492, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6958848834037781, + "num_tokens": 501031692.0, + "step": 19361 + }, + { + "epoch": 2.1262903580057104, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.625093936920166, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7142555117607117, + "num_tokens": 501055722.0, + "step": 19362 + }, + { + "epoch": 2.126400175708324, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.517472982406616, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7213367223739624, + "num_tokens": 501079954.0, + "step": 19363 + }, + { + "epoch": 2.126509993410938, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5595812797546387, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7255539894104004, + "num_tokens": 501102979.0, + "step": 19364 + }, + { + "epoch": 2.1266198111135517, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3373448848724365, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 
0.7195431590080261, + "num_tokens": 501128962.0, + "step": 19365 + }, + { + "epoch": 2.126729628816165, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4761781692504883, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7213314771652222, + "num_tokens": 501153358.0, + "step": 19366 + }, + { + "epoch": 2.1268394465187788, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2686264514923096, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7173256874084473, + "num_tokens": 501183138.0, + "step": 19367 + }, + { + "epoch": 2.1269492642213925, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.833829879760742, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.72052001953125, + "num_tokens": 501202590.0, + "step": 19368 + }, + { + "epoch": 2.1270590819240063, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.646575689315796, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7282871603965759, + "num_tokens": 501224546.0, + "step": 19369 + }, + { + "epoch": 2.1271688996266196, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.642536163330078, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7281224727630615, + "num_tokens": 501246890.0, + "step": 19370 + }, + { + "epoch": 2.1272787173292333, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.633577346801758, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7250571846961975, + "num_tokens": 501271218.0, + "step": 19371 + }, + { + "epoch": 2.127388535031847, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.44238543510437, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7036517262458801, + "num_tokens": 501298334.0, + "step": 19372 + }, + { + "epoch": 2.127498352734461, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7527897357940674, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7246159911155701, + "num_tokens": 
501318225.0, + "step": 19373 + }, + { + "epoch": 2.1276081704370746, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.736276626586914, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7353019714355469, + "num_tokens": 501337679.0, + "step": 19374 + }, + { + "epoch": 2.127717988139688, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6996939182281494, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7378004789352417, + "num_tokens": 501359800.0, + "step": 19375 + }, + { + "epoch": 2.1278278058423017, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4862418174743652, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7072751522064209, + "num_tokens": 501384640.0, + "step": 19376 + }, + { + "epoch": 2.1279376235449154, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.51082444190979, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7244566082954407, + "num_tokens": 501408346.0, + "step": 19377 + }, + { + "epoch": 2.128047441247529, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4594666957855225, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7243394255638123, + "num_tokens": 501434397.0, + "step": 19378 + }, + { + "epoch": 2.128157258950143, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.665860414505005, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7158462405204773, + "num_tokens": 501458928.0, + "step": 19379 + }, + { + "epoch": 2.1282670766527563, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.637357711791992, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7155782580375671, + "num_tokens": 501482911.0, + "step": 19380 + }, + { + "epoch": 2.12837689435537, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.377277135848999, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7111421823501587, + "num_tokens": 501511662.0, + "step": 19381 + }, + { + 
"epoch": 2.128486712057984, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2921981811523438, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7148441076278687, + "num_tokens": 501540540.0, + "step": 19382 + }, + { + "epoch": 2.1285965297605975, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3098132610321045, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6931490898132324, + "num_tokens": 501570794.0, + "step": 19383 + }, + { + "epoch": 2.1287063474632113, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4223034381866455, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6999067664146423, + "num_tokens": 501598342.0, + "step": 19384 + }, + { + "epoch": 2.1288161651658246, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7137861251831055, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7356265187263489, + "num_tokens": 501617992.0, + "step": 19385 + }, + { + "epoch": 2.1289259828684384, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.611011028289795, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7118589282035828, + "num_tokens": 501640359.0, + "step": 19386 + }, + { + "epoch": 2.129035800571052, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3342363834381104, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7214391231536865, + "num_tokens": 501666252.0, + "step": 19387 + }, + { + "epoch": 2.129145618273666, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.347459554672241, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.692252516746521, + "num_tokens": 501696625.0, + "step": 19388 + }, + { + "epoch": 2.129255435976279, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.590233087539673, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6929237842559814, + "num_tokens": 501721953.0, + "step": 19389 + }, + { + "epoch": 2.129365253678893, + 
"ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.230358600616455, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.732953667640686, + "num_tokens": 501751758.0, + "step": 19390 + }, + { + "epoch": 2.1294750713815067, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3936805725097656, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7447929978370667, + "num_tokens": 501777933.0, + "step": 19391 + }, + { + "epoch": 2.1295848890841205, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3820512294769287, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6886849403381348, + "num_tokens": 501805384.0, + "step": 19392 + }, + { + "epoch": 2.1296947067867342, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.78981351852417, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7370060682296753, + "num_tokens": 501825156.0, + "step": 19393 + }, + { + "epoch": 2.1298045244893475, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.541116237640381, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7397860288619995, + "num_tokens": 501847690.0, + "step": 19394 + }, + { + "epoch": 2.1299143421919613, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6062772274017334, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7216141223907471, + "num_tokens": 501871064.0, + "step": 19395 + }, + { + "epoch": 2.130024159894575, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.443178176879883, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7012931704521179, + "num_tokens": 501898573.0, + "step": 19396 + }, + { + "epoch": 2.130133977597189, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3447389602661133, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7142837047576904, + "num_tokens": 501926272.0, + "step": 19397 + }, + { + "epoch": 2.130243795299802, + "ewc_loss": 2.110004425048828e-05, + 
"grad_norm": 2.340989589691162, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6879187226295471, + "num_tokens": 501954207.0, + "step": 19398 + }, + { + "epoch": 2.130353613002416, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5875186920166016, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7143370509147644, + "num_tokens": 501977226.0, + "step": 19399 + }, + { + "epoch": 2.1304634307050296, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.471940279006958, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7339575886726379, + "num_tokens": 502001304.0, + "step": 19400 + }, + { + "epoch": 2.1305732484076434, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6844780445098877, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7239406108856201, + "num_tokens": 502023302.0, + "step": 19401 + }, + { + "epoch": 2.130683066110257, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.354854106903076, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7173197865486145, + "num_tokens": 502048565.0, + "step": 19402 + }, + { + "epoch": 2.1307928838128705, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.631654977798462, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7060034275054932, + "num_tokens": 502071693.0, + "step": 19403 + }, + { + "epoch": 2.130902701515484, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2046427726745605, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6879329681396484, + "num_tokens": 502102292.0, + "step": 19404 + }, + { + "epoch": 2.131012519218098, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6284613609313965, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7319208383560181, + "num_tokens": 502124153.0, + "step": 19405 + }, + { + "epoch": 2.1311223369207117, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4257588386535645, + 
"learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7295969128608704, + "num_tokens": 502149127.0, + "step": 19406 + }, + { + "epoch": 2.1312321546233255, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.411203622817993, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7251007556915283, + "num_tokens": 502175688.0, + "step": 19407 + }, + { + "epoch": 2.131341972325939, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.970217704772949, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7186675667762756, + "num_tokens": 502194729.0, + "step": 19408 + }, + { + "epoch": 2.1314517900285526, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.283567428588867, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7171129584312439, + "num_tokens": 502223616.0, + "step": 19409 + }, + { + "epoch": 2.1315616077311663, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6286096572875977, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7165403962135315, + "num_tokens": 502246377.0, + "step": 19410 + }, + { + "epoch": 2.13167142543378, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4601492881774902, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6999067664146423, + "num_tokens": 502274348.0, + "step": 19411 + }, + { + "epoch": 2.1317812431363934, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.182055950164795, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7252164483070374, + "num_tokens": 502304932.0, + "step": 19412 + }, + { + "epoch": 2.131891060839007, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.1868886947631836, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7122220993041992, + "num_tokens": 502334295.0, + "step": 19413 + }, + { + "epoch": 2.132000878541621, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.287775754928589, + "learning_rate": 1e-06, + "loss": 1.0017, 
+ "mean_token_accuracy": 0.7158516645431519, + "num_tokens": 502361123.0, + "step": 19414 + }, + { + "epoch": 2.1321106962442347, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.473052740097046, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7259103655815125, + "num_tokens": 502385868.0, + "step": 19415 + }, + { + "epoch": 2.1322205139468484, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4176619052886963, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7512798309326172, + "num_tokens": 502411193.0, + "step": 19416 + }, + { + "epoch": 2.1323303316494617, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5357069969177246, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7160218358039856, + "num_tokens": 502436624.0, + "step": 19417 + }, + { + "epoch": 2.1324401493520755, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.359973192214966, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.698985755443573, + "num_tokens": 502463703.0, + "step": 19418 + }, + { + "epoch": 2.1325499670546892, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.492088794708252, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.712554931640625, + "num_tokens": 502490220.0, + "step": 19419 + }, + { + "epoch": 2.132659784757303, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.1452722549438477, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7453639507293701, + "num_tokens": 502520829.0, + "step": 19420 + }, + { + "epoch": 2.1327696024599163, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.222860097885132, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7038508653640747, + "num_tokens": 502548809.0, + "step": 19421 + }, + { + "epoch": 2.13287942016253, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.38688063621521, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7242718935012817, 
+ "num_tokens": 502572556.0, + "step": 19422 + }, + { + "epoch": 2.132989237865144, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3164079189300537, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7062485814094543, + "num_tokens": 502601085.0, + "step": 19423 + }, + { + "epoch": 2.1330990555677576, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5712788105010986, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7118965983390808, + "num_tokens": 502626949.0, + "step": 19424 + }, + { + "epoch": 2.1332088732703713, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6987903118133545, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7382354736328125, + "num_tokens": 502648130.0, + "step": 19425 + }, + { + "epoch": 2.1333186909729847, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.540808916091919, + "learning_rate": 1e-06, + "loss": 1.073, + "mean_token_accuracy": 0.6934107542037964, + "num_tokens": 502672859.0, + "step": 19426 + }, + { + "epoch": 2.1334285086755984, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2660722732543945, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6932001113891602, + "num_tokens": 502704382.0, + "step": 19427 + }, + { + "epoch": 2.133538326378212, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.29514741897583, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7261221408843994, + "num_tokens": 502734070.0, + "step": 19428 + }, + { + "epoch": 2.133648144080826, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 3.060433864593506, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7298189401626587, + "num_tokens": 502752470.0, + "step": 19429 + }, + { + "epoch": 2.1337579617834397, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4989278316497803, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.704902172088623, + "num_tokens": 502777691.0, + "step": 
19430 + }, + { + "epoch": 2.133867779486053, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.561211585998535, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7136178016662598, + "num_tokens": 502802280.0, + "step": 19431 + }, + { + "epoch": 2.1339775971886668, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.1174607276916504, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7076349258422852, + "num_tokens": 502836765.0, + "step": 19432 + }, + { + "epoch": 2.1340874148912805, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.295790672302246, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6996042728424072, + "num_tokens": 502865359.0, + "step": 19433 + }, + { + "epoch": 2.1341972325938943, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.429478883743286, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.713900625705719, + "num_tokens": 502890557.0, + "step": 19434 + }, + { + "epoch": 2.134307050296508, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2780840396881104, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7160658836364746, + "num_tokens": 502919872.0, + "step": 19435 + }, + { + "epoch": 2.1344168679991213, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7094080448150635, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6927045583724976, + "num_tokens": 502945931.0, + "step": 19436 + }, + { + "epoch": 2.134526685701735, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4108364582061768, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7135763168334961, + "num_tokens": 502973651.0, + "step": 19437 + }, + { + "epoch": 2.134636503404349, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.413165807723999, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7092709541320801, + "num_tokens": 502999893.0, + "step": 19438 + }, + { + "epoch": 
2.1347463211069626, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.8268933296203613, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7298088073730469, + "num_tokens": 503019470.0, + "step": 19439 + }, + { + "epoch": 2.134856138809576, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5529673099517822, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7136677503585815, + "num_tokens": 503043003.0, + "step": 19440 + }, + { + "epoch": 2.1349659565121897, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.310629367828369, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7022398710250854, + "num_tokens": 503072935.0, + "step": 19441 + }, + { + "epoch": 2.1350757742148034, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.411658763885498, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7452995181083679, + "num_tokens": 503097451.0, + "step": 19442 + }, + { + "epoch": 2.135185591917417, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.539029836654663, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7174392342567444, + "num_tokens": 503122261.0, + "step": 19443 + }, + { + "epoch": 2.135295409620031, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3843116760253906, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6996163129806519, + "num_tokens": 503152427.0, + "step": 19444 + }, + { + "epoch": 2.1354052273226443, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4773142337799072, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.697991132736206, + "num_tokens": 503177395.0, + "step": 19445 + }, + { + "epoch": 2.135515045025258, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.698625087738037, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.717829704284668, + "num_tokens": 503200906.0, + "step": 19446 + }, + { + "epoch": 2.1356248627278718, + "ewc_loss": 
2.110004425048828e-05, + "grad_norm": 2.4784767627716064, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7123032212257385, + "num_tokens": 503224808.0, + "step": 19447 + }, + { + "epoch": 2.1357346804304855, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.46386981010437, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7037361264228821, + "num_tokens": 503252145.0, + "step": 19448 + }, + { + "epoch": 2.135844498133099, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4268274307250977, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7320107817649841, + "num_tokens": 503280156.0, + "step": 19449 + }, + { + "epoch": 2.1359543158357126, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4904229640960693, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7355716824531555, + "num_tokens": 503305478.0, + "step": 19450 + }, + { + "epoch": 2.1360641335383264, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5086452960968018, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7085366249084473, + "num_tokens": 503328600.0, + "step": 19451 + }, + { + "epoch": 2.13617395124094, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.819432258605957, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.708613395690918, + "num_tokens": 503351610.0, + "step": 19452 + }, + { + "epoch": 2.136283768943554, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.840156316757202, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7430328726768494, + "num_tokens": 503370742.0, + "step": 19453 + }, + { + "epoch": 2.136393586646167, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.403801202774048, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7225263714790344, + "num_tokens": 503397469.0, + "step": 19454 + }, + { + "epoch": 2.136503404348781, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 
2.1811375617980957, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7209686636924744, + "num_tokens": 503431983.0, + "step": 19455 + }, + { + "epoch": 2.1366132220513947, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4284651279449463, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7237474918365479, + "num_tokens": 503460751.0, + "step": 19456 + }, + { + "epoch": 2.1367230397540085, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.55218505859375, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7106908559799194, + "num_tokens": 503485362.0, + "step": 19457 + }, + { + "epoch": 2.136832857456622, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4883179664611816, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7367260456085205, + "num_tokens": 503508116.0, + "step": 19458 + }, + { + "epoch": 2.1369426751592355, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.363424301147461, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7124048471450806, + "num_tokens": 503535310.0, + "step": 19459 + }, + { + "epoch": 2.1370524928618493, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.386763572692871, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6835368871688843, + "num_tokens": 503565033.0, + "step": 19460 + }, + { + "epoch": 2.137162310564463, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6698904037475586, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7384355068206787, + "num_tokens": 503585580.0, + "step": 19461 + }, + { + "epoch": 2.137272128267077, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.462374687194824, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7232013940811157, + "num_tokens": 503610320.0, + "step": 19462 + }, + { + "epoch": 2.13738194596969, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.404771566390991, + "learning_rate": 
1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7248717546463013, + "num_tokens": 503636171.0, + "step": 19463 + }, + { + "epoch": 2.137491763672304, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.1888511180877686, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7096412777900696, + "num_tokens": 503667294.0, + "step": 19464 + }, + { + "epoch": 2.1376015813749176, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3964593410491943, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7275948524475098, + "num_tokens": 503693320.0, + "step": 19465 + }, + { + "epoch": 2.1377113990775314, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4609673023223877, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7279513478279114, + "num_tokens": 503718333.0, + "step": 19466 + }, + { + "epoch": 2.137821216780145, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.148627758026123, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7205500602722168, + "num_tokens": 503750800.0, + "step": 19467 + }, + { + "epoch": 2.1379310344827585, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3542895317077637, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7376742362976074, + "num_tokens": 503776981.0, + "step": 19468 + }, + { + "epoch": 2.138040852185372, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2014663219451904, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7265239953994751, + "num_tokens": 503805743.0, + "step": 19469 + }, + { + "epoch": 2.138150669887986, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.601771116256714, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.739477276802063, + "num_tokens": 503828450.0, + "step": 19470 + }, + { + "epoch": 2.1382604875905997, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6515755653381348, + "learning_rate": 1e-06, + "loss": 0.926, + 
"mean_token_accuracy": 0.7248458862304688, + "num_tokens": 503851585.0, + "step": 19471 + }, + { + "epoch": 2.138370305293213, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3997018337249756, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.724332869052887, + "num_tokens": 503877200.0, + "step": 19472 + }, + { + "epoch": 2.138480122995827, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6379594802856445, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7128477692604065, + "num_tokens": 503901070.0, + "step": 19473 + }, + { + "epoch": 2.1385899406984406, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.54191517829895, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7071036100387573, + "num_tokens": 503924246.0, + "step": 19474 + }, + { + "epoch": 2.1386997584010543, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3062734603881836, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7035696506500244, + "num_tokens": 503954041.0, + "step": 19475 + }, + { + "epoch": 2.138809576103668, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.407820463180542, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7158800363540649, + "num_tokens": 503981062.0, + "step": 19476 + }, + { + "epoch": 2.1389193938062814, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.9139764308929443, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7175407409667969, + "num_tokens": 504002281.0, + "step": 19477 + }, + { + "epoch": 2.139029211508895, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3703556060791016, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7244164943695068, + "num_tokens": 504032604.0, + "step": 19478 + }, + { + "epoch": 2.139139029211509, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.29095458984375, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7047276496887207, 
+ "num_tokens": 504061406.0, + "step": 19479 + }, + { + "epoch": 2.1392488469141226, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6554386615753174, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7240555286407471, + "num_tokens": 504082777.0, + "step": 19480 + }, + { + "epoch": 2.1393586646167364, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4660706520080566, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7088081240653992, + "num_tokens": 504107983.0, + "step": 19481 + }, + { + "epoch": 2.1394684823193497, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3082709312438965, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6887606382369995, + "num_tokens": 504139751.0, + "step": 19482 + }, + { + "epoch": 2.1395783000219635, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.497727870941162, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7141757607460022, + "num_tokens": 504163631.0, + "step": 19483 + }, + { + "epoch": 2.1396881177245772, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.524428367614746, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.7054439783096313, + "num_tokens": 504188728.0, + "step": 19484 + }, + { + "epoch": 2.139797935427191, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.705928087234497, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.710584282875061, + "num_tokens": 504214656.0, + "step": 19485 + }, + { + "epoch": 2.1399077531298047, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5774877071380615, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7475589513778687, + "num_tokens": 504236576.0, + "step": 19486 + }, + { + "epoch": 2.140017570832418, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3962478637695312, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.720573902130127, + "num_tokens": 504263587.0, + "step": 
19487 + }, + { + "epoch": 2.140127388535032, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5263285636901855, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7265878319740295, + "num_tokens": 504286875.0, + "step": 19488 + }, + { + "epoch": 2.1402372062376456, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5378527641296387, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7311447858810425, + "num_tokens": 504310774.0, + "step": 19489 + }, + { + "epoch": 2.1403470239402593, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6968162059783936, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7366136312484741, + "num_tokens": 504332683.0, + "step": 19490 + }, + { + "epoch": 2.1404568416428726, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.559236526489258, + "learning_rate": 1e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6861514449119568, + "num_tokens": 504359380.0, + "step": 19491 + }, + { + "epoch": 2.1405666593454864, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.363980770111084, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7155565023422241, + "num_tokens": 504389134.0, + "step": 19492 + }, + { + "epoch": 2.1406764770481, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5330467224121094, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7034448385238647, + "num_tokens": 504414439.0, + "step": 19493 + }, + { + "epoch": 2.140786294750714, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6413495540618896, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7057130932807922, + "num_tokens": 504437166.0, + "step": 19494 + }, + { + "epoch": 2.1408961124533277, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6312921047210693, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7170244455337524, + "num_tokens": 504460330.0, + "step": 19495 + }, + { + "epoch": 
2.141005930155941, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.41689395904541, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6981611847877502, + "num_tokens": 504487957.0, + "step": 19496 + }, + { + "epoch": 2.1411157478585547, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2981274127960205, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7179402112960815, + "num_tokens": 504516567.0, + "step": 19497 + }, + { + "epoch": 2.1412255655611685, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.393247604370117, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7137807607650757, + "num_tokens": 504545127.0, + "step": 19498 + }, + { + "epoch": 2.1413353832637823, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 3.071396589279175, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7273818850517273, + "num_tokens": 504563497.0, + "step": 19499 + }, + { + "epoch": 2.1414452009663956, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.479215621948242, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7330408096313477, + "num_tokens": 504587343.0, + "step": 19500 + }, + { + "epoch": 2.1415550186690093, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2023069858551025, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7127513885498047, + "num_tokens": 504620265.0, + "step": 19501 + }, + { + "epoch": 2.141664836371623, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.785332441329956, + "learning_rate": 1e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7523362040519714, + "num_tokens": 504638830.0, + "step": 19502 + }, + { + "epoch": 2.141774654074237, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6062827110290527, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7164045572280884, + "num_tokens": 504661914.0, + "step": 19503 + }, + { + "epoch": 2.1418844717768506, + "ewc_loss": 
2.110004425048828e-05, + "grad_norm": 2.341473340988159, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.709966778755188, + "num_tokens": 504689335.0, + "step": 19504 + }, + { + "epoch": 2.141994289479464, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2472481727600098, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.712814211845398, + "num_tokens": 504718675.0, + "step": 19505 + }, + { + "epoch": 2.1421041071820777, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.605905771255493, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7271705865859985, + "num_tokens": 504741613.0, + "step": 19506 + }, + { + "epoch": 2.1422139248846914, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2700934410095215, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7157813310623169, + "num_tokens": 504769859.0, + "step": 19507 + }, + { + "epoch": 2.142323742587305, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.350029945373535, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7305445075035095, + "num_tokens": 504797627.0, + "step": 19508 + }, + { + "epoch": 2.142433560289919, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.758610963821411, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7282859683036804, + "num_tokens": 504820414.0, + "step": 19509 + }, + { + "epoch": 2.1425433779925322, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5079290866851807, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7118746042251587, + "num_tokens": 504844428.0, + "step": 19510 + }, + { + "epoch": 2.142653195695146, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4898838996887207, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6860198378562927, + "num_tokens": 504871808.0, + "step": 19511 + }, + { + "epoch": 2.1427630133977598, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 
2.518847703933716, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7122737765312195, + "num_tokens": 504897352.0, + "step": 19512 + }, + { + "epoch": 2.1428728311003735, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5665841102600098, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7262252569198608, + "num_tokens": 504920529.0, + "step": 19513 + }, + { + "epoch": 2.142982648802987, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6894052028656006, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7195465564727783, + "num_tokens": 504943383.0, + "step": 19514 + }, + { + "epoch": 2.1430924665056006, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.360466957092285, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7178564071655273, + "num_tokens": 504972413.0, + "step": 19515 + }, + { + "epoch": 2.1432022842082143, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.8035616874694824, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7207825779914856, + "num_tokens": 504994458.0, + "step": 19516 + }, + { + "epoch": 2.143312101910828, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.485666275024414, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7252799272537231, + "num_tokens": 505022774.0, + "step": 19517 + }, + { + "epoch": 2.143421919613442, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.627150535583496, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7163151502609253, + "num_tokens": 505046288.0, + "step": 19518 + }, + { + "epoch": 2.143531737316055, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.515141010284424, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.719251275062561, + "num_tokens": 505073053.0, + "step": 19519 + }, + { + "epoch": 2.143641555018669, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4450583457946777, + "learning_rate": 
1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7097824811935425, + "num_tokens": 505101192.0, + "step": 19520 + }, + { + "epoch": 2.1437513727212827, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5178091526031494, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7119917869567871, + "num_tokens": 505127222.0, + "step": 19521 + }, + { + "epoch": 2.1438611904238964, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.390500545501709, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6890350580215454, + "num_tokens": 505155887.0, + "step": 19522 + }, + { + "epoch": 2.1439710081265098, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3379104137420654, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7112818360328674, + "num_tokens": 505184600.0, + "step": 19523 + }, + { + "epoch": 2.1440808258291235, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6542859077453613, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7065551280975342, + "num_tokens": 505208945.0, + "step": 19524 + }, + { + "epoch": 2.1441906435317373, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7075233459472656, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7387757897377014, + "num_tokens": 505232164.0, + "step": 19525 + }, + { + "epoch": 2.144300461234351, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3558297157287598, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7002251744270325, + "num_tokens": 505260252.0, + "step": 19526 + }, + { + "epoch": 2.144410278936965, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.579010248184204, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7333898544311523, + "num_tokens": 505282177.0, + "step": 19527 + }, + { + "epoch": 2.144520096639578, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5778844356536865, + "learning_rate": 1e-06, + "loss": 0.9618, + 
"mean_token_accuracy": 0.7163835167884827, + "num_tokens": 505305190.0, + "step": 19528 + }, + { + "epoch": 2.144629914342192, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5750653743743896, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7265344858169556, + "num_tokens": 505329180.0, + "step": 19529 + }, + { + "epoch": 2.1447397320448056, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.23530912399292, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7261319160461426, + "num_tokens": 505358431.0, + "step": 19530 + }, + { + "epoch": 2.1448495497474194, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2362468242645264, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7019724249839783, + "num_tokens": 505388397.0, + "step": 19531 + }, + { + "epoch": 2.144959367450033, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.252594232559204, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7172159552574158, + "num_tokens": 505420726.0, + "step": 19532 + }, + { + "epoch": 2.1450691851526464, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.561589241027832, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7318071722984314, + "num_tokens": 505447288.0, + "step": 19533 + }, + { + "epoch": 2.14517900285526, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.199820041656494, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7100865840911865, + "num_tokens": 505481821.0, + "step": 19534 + }, + { + "epoch": 2.145288820557874, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3914105892181396, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7113313674926758, + "num_tokens": 505510728.0, + "step": 19535 + }, + { + "epoch": 2.1453986382604877, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6919946670532227, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.7002121806144714, 
+ "num_tokens": 505533522.0, + "step": 19536 + }, + { + "epoch": 2.1455084559631015, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.427377700805664, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.7002805471420288, + "num_tokens": 505561078.0, + "step": 19537 + }, + { + "epoch": 2.145618273665715, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.214352607727051, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6952718496322632, + "num_tokens": 505593858.0, + "step": 19538 + }, + { + "epoch": 2.1457280913683285, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.8178584575653076, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7110267281532288, + "num_tokens": 505619141.0, + "step": 19539 + }, + { + "epoch": 2.1458379090709423, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5995140075683594, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.734427809715271, + "num_tokens": 505645105.0, + "step": 19540 + }, + { + "epoch": 2.145947726773556, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.316314220428467, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6973423957824707, + "num_tokens": 505675618.0, + "step": 19541 + }, + { + "epoch": 2.1460575444761694, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4747838973999023, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7062286734580994, + "num_tokens": 505701316.0, + "step": 19542 + }, + { + "epoch": 2.146167362178783, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.345541477203369, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.7024107575416565, + "num_tokens": 505728786.0, + "step": 19543 + }, + { + "epoch": 2.146277179881397, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.474400758743286, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7306153178215027, + "num_tokens": 505754729.0, + "step": 
19544 + }, + { + "epoch": 2.1463869975840106, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.440627336502075, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6943142414093018, + "num_tokens": 505786922.0, + "step": 19545 + }, + { + "epoch": 2.1464968152866244, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.454265594482422, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7135120630264282, + "num_tokens": 505816400.0, + "step": 19546 + }, + { + "epoch": 2.1466066329892377, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.473271131515503, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7488483190536499, + "num_tokens": 505839740.0, + "step": 19547 + }, + { + "epoch": 2.1467164506918515, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.352975606918335, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.722690761089325, + "num_tokens": 505868143.0, + "step": 19548 + }, + { + "epoch": 2.146826268394465, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.495248317718506, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.706504225730896, + "num_tokens": 505894277.0, + "step": 19549 + }, + { + "epoch": 2.146936086097079, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4543046951293945, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7153633832931519, + "num_tokens": 505919504.0, + "step": 19550 + }, + { + "epoch": 2.1470459037996923, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6405138969421387, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7145298719406128, + "num_tokens": 505942117.0, + "step": 19551 + }, + { + "epoch": 2.147155721502306, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.617790699005127, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.706838071346283, + "num_tokens": 505966337.0, + "step": 19552 + }, + { + "epoch": 2.14726553920492, 
+ "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5460364818573, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7043901085853577, + "num_tokens": 505991024.0, + "step": 19553 + }, + { + "epoch": 2.1473753569075336, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2567672729492188, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.713516116142273, + "num_tokens": 506018116.0, + "step": 19554 + }, + { + "epoch": 2.1474851746101473, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4617507457733154, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7107059955596924, + "num_tokens": 506042599.0, + "step": 19555 + }, + { + "epoch": 2.1475949923127606, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3349769115448, + "learning_rate": 1e-06, + "loss": 1.0801, + "mean_token_accuracy": 0.6819956302642822, + "num_tokens": 506071190.0, + "step": 19556 + }, + { + "epoch": 2.1477048100153744, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3405771255493164, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7201429009437561, + "num_tokens": 506101070.0, + "step": 19557 + }, + { + "epoch": 2.147814627717988, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.47255277633667, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7403430938720703, + "num_tokens": 506123347.0, + "step": 19558 + }, + { + "epoch": 2.147924445420602, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6224193572998047, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7122005224227905, + "num_tokens": 506146118.0, + "step": 19559 + }, + { + "epoch": 2.1480342631232157, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.496474504470825, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6996474862098694, + "num_tokens": 506174054.0, + "step": 19560 + }, + { + "epoch": 2.148144080825829, + "ewc_loss": 2.110004425048828e-05, + 
"grad_norm": 2.583678960800171, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7296474575996399, + "num_tokens": 506197243.0, + "step": 19561 + }, + { + "epoch": 2.1482538985284427, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.228173017501831, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6993967294692993, + "num_tokens": 506229973.0, + "step": 19562 + }, + { + "epoch": 2.1483637162310565, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6907849311828613, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7188727855682373, + "num_tokens": 506250745.0, + "step": 19563 + }, + { + "epoch": 2.1484735339336702, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2147459983825684, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.7066746950149536, + "num_tokens": 506282315.0, + "step": 19564 + }, + { + "epoch": 2.148583351636284, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.149355411529541, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7049912810325623, + "num_tokens": 506316308.0, + "step": 19565 + }, + { + "epoch": 2.1486931693388973, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.320725440979004, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7085466384887695, + "num_tokens": 506345180.0, + "step": 19566 + }, + { + "epoch": 2.148802987041511, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5170412063598633, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7240332365036011, + "num_tokens": 506369076.0, + "step": 19567 + }, + { + "epoch": 2.148912804744125, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.567566156387329, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6996188163757324, + "num_tokens": 506393479.0, + "step": 19568 + }, + { + "epoch": 2.1490226224467386, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3642585277557373, + 
"learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7163610458374023, + "num_tokens": 506419932.0, + "step": 19569 + }, + { + "epoch": 2.149132440149352, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3432655334472656, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7087512612342834, + "num_tokens": 506448953.0, + "step": 19570 + }, + { + "epoch": 2.1492422578519657, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2767887115478516, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7164076566696167, + "num_tokens": 506478305.0, + "step": 19571 + }, + { + "epoch": 2.1493520755545794, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4439399242401123, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7182952165603638, + "num_tokens": 506503441.0, + "step": 19572 + }, + { + "epoch": 2.149461893257193, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5362648963928223, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.7026406526565552, + "num_tokens": 506527930.0, + "step": 19573 + }, + { + "epoch": 2.149571710959807, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.341888189315796, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.703933596611023, + "num_tokens": 506557797.0, + "step": 19574 + }, + { + "epoch": 2.1496815286624202, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5391664505004883, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7303164005279541, + "num_tokens": 506584377.0, + "step": 19575 + }, + { + "epoch": 2.149791346365034, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 3.025263547897339, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.717704176902771, + "num_tokens": 506603088.0, + "step": 19576 + }, + { + "epoch": 2.1499011640676478, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4202282428741455, + "learning_rate": 1e-06, + "loss": 
1.0173, + "mean_token_accuracy": 0.6999967098236084, + "num_tokens": 506630461.0, + "step": 19577 + }, + { + "epoch": 2.1500109817702615, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4188833236694336, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7481136918067932, + "num_tokens": 506654651.0, + "step": 19578 + }, + { + "epoch": 2.150120799472875, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.225243330001831, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6979002952575684, + "num_tokens": 506685387.0, + "step": 19579 + }, + { + "epoch": 2.1502306171754886, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.344862937927246, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7405366897583008, + "num_tokens": 506710343.0, + "step": 19580 + }, + { + "epoch": 2.1503404348781023, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3005471229553223, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7408567070960999, + "num_tokens": 506736132.0, + "step": 19581 + }, + { + "epoch": 2.150450252580716, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.739274263381958, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7165066003799438, + "num_tokens": 506757653.0, + "step": 19582 + }, + { + "epoch": 2.15056007028333, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.861966371536255, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7219847440719604, + "num_tokens": 506777449.0, + "step": 19583 + }, + { + "epoch": 2.150669887985943, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.552919626235962, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6959413290023804, + "num_tokens": 506804917.0, + "step": 19584 + }, + { + "epoch": 2.150779705688557, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.394986152648926, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 
0.7272766828536987, + "num_tokens": 506831845.0, + "step": 19585 + }, + { + "epoch": 2.1508895233911707, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6128947734832764, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7097359895706177, + "num_tokens": 506858898.0, + "step": 19586 + }, + { + "epoch": 2.1509993410937844, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7740890979766846, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7204527854919434, + "num_tokens": 506881919.0, + "step": 19587 + }, + { + "epoch": 2.151109158796398, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5945980548858643, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7284128665924072, + "num_tokens": 506907384.0, + "step": 19588 + }, + { + "epoch": 2.1512189764990115, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7151098251342773, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7060015201568604, + "num_tokens": 506931758.0, + "step": 19589 + }, + { + "epoch": 2.1513287942016253, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.841289758682251, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7399853467941284, + "num_tokens": 506950804.0, + "step": 19590 + }, + { + "epoch": 2.151438611904239, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7407386302948, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.71990966796875, + "num_tokens": 506973831.0, + "step": 19591 + }, + { + "epoch": 2.1515484296068528, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4610564708709717, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6928794384002686, + "num_tokens": 507002361.0, + "step": 19592 + }, + { + "epoch": 2.151658247309466, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2718305587768555, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.71858811378479, + "num_tokens": 
507031774.0, + "step": 19593 + }, + { + "epoch": 2.15176806501208, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4431233406066895, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.693415105342865, + "num_tokens": 507056790.0, + "step": 19594 + }, + { + "epoch": 2.1518778827146936, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.552093505859375, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.702763020992279, + "num_tokens": 507082694.0, + "step": 19595 + }, + { + "epoch": 2.1519877004173074, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.864046812057495, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7297854423522949, + "num_tokens": 507102802.0, + "step": 19596 + }, + { + "epoch": 2.152097518119921, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4870057106018066, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7398504614830017, + "num_tokens": 507126400.0, + "step": 19597 + }, + { + "epoch": 2.1522073358225344, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4257047176361084, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7248415350914001, + "num_tokens": 507153282.0, + "step": 19598 + }, + { + "epoch": 2.152317153525148, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.756010055541992, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7328953742980957, + "num_tokens": 507174702.0, + "step": 19599 + }, + { + "epoch": 2.152426971227762, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3727285861968994, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7066811323165894, + "num_tokens": 507202298.0, + "step": 19600 + }, + { + "epoch": 2.1525367889303757, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5833675861358643, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.7006494998931885, + "num_tokens": 507227837.0, + "step": 19601 + }, + { + 
"epoch": 2.152646606632989, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.38871169090271, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7075992822647095, + "num_tokens": 507255427.0, + "step": 19602 + }, + { + "epoch": 2.1527564243356028, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4235434532165527, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7154979705810547, + "num_tokens": 507282890.0, + "step": 19603 + }, + { + "epoch": 2.1528662420382165, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.507918119430542, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7312878966331482, + "num_tokens": 507306909.0, + "step": 19604 + }, + { + "epoch": 2.1529760597408303, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.063424587249756, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7306574583053589, + "num_tokens": 507338753.0, + "step": 19605 + }, + { + "epoch": 2.153085877443444, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6504971981048584, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7042263746261597, + "num_tokens": 507360979.0, + "step": 19606 + }, + { + "epoch": 2.1531956951460574, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2036924362182617, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7202521562576294, + "num_tokens": 507390687.0, + "step": 19607 + }, + { + "epoch": 2.153305512848671, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5771477222442627, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.689416766166687, + "num_tokens": 507415851.0, + "step": 19608 + }, + { + "epoch": 2.153415330551285, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.8910603523254395, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.719708263874054, + "num_tokens": 507436547.0, + "step": 19609 + }, + { + "epoch": 2.1535251482538986, + "ewc_loss": 
2.110004425048828e-05, + "grad_norm": 2.7059803009033203, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7534018158912659, + "num_tokens": 507456399.0, + "step": 19610 + }, + { + "epoch": 2.1536349659565124, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4183528423309326, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6980311274528503, + "num_tokens": 507483644.0, + "step": 19611 + }, + { + "epoch": 2.1537447836591257, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4881272315979004, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7225716710090637, + "num_tokens": 507507416.0, + "step": 19612 + }, + { + "epoch": 2.1538546013617395, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.452364921569824, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.6843596696853638, + "num_tokens": 507534168.0, + "step": 19613 + }, + { + "epoch": 2.153964419064353, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.368558645248413, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7208420634269714, + "num_tokens": 507560904.0, + "step": 19614 + }, + { + "epoch": 2.154074236766967, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7403316497802734, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7405160665512085, + "num_tokens": 507583246.0, + "step": 19615 + }, + { + "epoch": 2.1541840544695807, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3846070766448975, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7273174524307251, + "num_tokens": 507608366.0, + "step": 19616 + }, + { + "epoch": 2.154293872172194, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.448561429977417, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7084988355636597, + "num_tokens": 507637581.0, + "step": 19617 + }, + { + "epoch": 2.154403689874808, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 
2.898740768432617, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7211806178092957, + "num_tokens": 507657453.0, + "step": 19618 + }, + { + "epoch": 2.1545135075774215, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4357194900512695, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6965832114219666, + "num_tokens": 507685092.0, + "step": 19619 + }, + { + "epoch": 2.1546233252800353, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.44889760017395, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7197472453117371, + "num_tokens": 507710248.0, + "step": 19620 + }, + { + "epoch": 2.1547331429826486, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5453126430511475, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.739492654800415, + "num_tokens": 507735881.0, + "step": 19621 + }, + { + "epoch": 2.1548429606852624, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4033496379852295, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.707482635974884, + "num_tokens": 507762092.0, + "step": 19622 + }, + { + "epoch": 2.154952778387876, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.315495491027832, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.700875997543335, + "num_tokens": 507789685.0, + "step": 19623 + }, + { + "epoch": 2.15506259609049, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5295488834381104, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6981716156005859, + "num_tokens": 507815097.0, + "step": 19624 + }, + { + "epoch": 2.1551724137931036, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.50849986076355, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7216825485229492, + "num_tokens": 507839109.0, + "step": 19625 + }, + { + "epoch": 2.155282231495717, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5846352577209473, + "learning_rate": 1e-06, 
+ "loss": 1.0397, + "mean_token_accuracy": 0.6925681829452515, + "num_tokens": 507864532.0, + "step": 19626 + }, + { + "epoch": 2.1553920491983307, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7870538234710693, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7163775563240051, + "num_tokens": 507886816.0, + "step": 19627 + }, + { + "epoch": 2.1555018669009445, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.570702075958252, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.733372688293457, + "num_tokens": 507910570.0, + "step": 19628 + }, + { + "epoch": 2.1556116846035582, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7223596572875977, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7381336688995361, + "num_tokens": 507930748.0, + "step": 19629 + }, + { + "epoch": 2.1557215023061715, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6182234287261963, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7052673101425171, + "num_tokens": 507954800.0, + "step": 19630 + }, + { + "epoch": 2.1558313200087853, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.934039354324341, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7217268943786621, + "num_tokens": 507974285.0, + "step": 19631 + }, + { + "epoch": 2.155941137711399, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3245232105255127, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7265835404396057, + "num_tokens": 508001138.0, + "step": 19632 + }, + { + "epoch": 2.156050955414013, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3368043899536133, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7189512848854065, + "num_tokens": 508027055.0, + "step": 19633 + }, + { + "epoch": 2.1561607731166266, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.333394765853882, + "learning_rate": 1e-06, + "loss": 1.0245, + 
"mean_token_accuracy": 0.7009845972061157, + "num_tokens": 508055242.0, + "step": 19634 + }, + { + "epoch": 2.15627059081924, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.336414337158203, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6946837902069092, + "num_tokens": 508084001.0, + "step": 19635 + }, + { + "epoch": 2.1563804085218536, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2699849605560303, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7186365127563477, + "num_tokens": 508111763.0, + "step": 19636 + }, + { + "epoch": 2.1564902262244674, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.440896987915039, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7256920337677002, + "num_tokens": 508137475.0, + "step": 19637 + }, + { + "epoch": 2.156600043927081, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2857751846313477, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.713358998298645, + "num_tokens": 508165164.0, + "step": 19638 + }, + { + "epoch": 2.156709861629695, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.992602586746216, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7398704886436462, + "num_tokens": 508183366.0, + "step": 19639 + }, + { + "epoch": 2.1568196793323082, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4166297912597656, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7019805312156677, + "num_tokens": 508210922.0, + "step": 19640 + }, + { + "epoch": 2.156929497034922, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.554246664047241, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7096348404884338, + "num_tokens": 508233612.0, + "step": 19641 + }, + { + "epoch": 2.1570393147375357, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3273773193359375, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 
0.6999974846839905, + "num_tokens": 508262678.0, + "step": 19642 + }, + { + "epoch": 2.1571491324401495, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.441878080368042, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7034578919410706, + "num_tokens": 508290696.0, + "step": 19643 + }, + { + "epoch": 2.157258950142763, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.163447618484497, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7273557186126709, + "num_tokens": 508321748.0, + "step": 19644 + }, + { + "epoch": 2.1573687678453766, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.425642490386963, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.719569981098175, + "num_tokens": 508345438.0, + "step": 19645 + }, + { + "epoch": 2.1574785855479903, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3592469692230225, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7009618282318115, + "num_tokens": 508372099.0, + "step": 19646 + }, + { + "epoch": 2.157588403250604, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4000303745269775, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7200434803962708, + "num_tokens": 508398974.0, + "step": 19647 + }, + { + "epoch": 2.157698220953218, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3272809982299805, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7107276320457458, + "num_tokens": 508428562.0, + "step": 19648 + }, + { + "epoch": 2.157808038655831, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 3.0905239582061768, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6974388957023621, + "num_tokens": 508456487.0, + "step": 19649 + }, + { + "epoch": 2.157917856358445, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6025712490081787, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7243523597717285, + "num_tokens": 
508481014.0, + "step": 19650 + }, + { + "epoch": 2.1580276740610587, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5385830402374268, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7190629243850708, + "num_tokens": 508506398.0, + "step": 19651 + }, + { + "epoch": 2.1581374917636724, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6981890201568604, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7258888483047485, + "num_tokens": 508530440.0, + "step": 19652 + }, + { + "epoch": 2.1582473094662857, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.7195935249328613, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7072436809539795, + "num_tokens": 508553552.0, + "step": 19653 + }, + { + "epoch": 2.1583571271688995, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.546879529953003, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.732738733291626, + "num_tokens": 508579793.0, + "step": 19654 + }, + { + "epoch": 2.1584669448715132, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.124990224838257, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.700415849685669, + "num_tokens": 508614825.0, + "step": 19655 + }, + { + "epoch": 2.158576762574127, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.698108434677124, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7290666103363037, + "num_tokens": 508636198.0, + "step": 19656 + }, + { + "epoch": 2.1586865802767408, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2482035160064697, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7095034122467041, + "num_tokens": 508665107.0, + "step": 19657 + }, + { + "epoch": 2.158796397979354, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2243218421936035, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7162439227104187, + "num_tokens": 508692624.0, + "step": 19658 + }, + { + 
"epoch": 2.158906215681968, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.500657320022583, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7201218008995056, + "num_tokens": 508716756.0, + "step": 19659 + }, + { + "epoch": 2.1590160333845816, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.307600259780884, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7086261510848999, + "num_tokens": 508745627.0, + "step": 19660 + }, + { + "epoch": 2.1591258510871953, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6383450031280518, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.7011086940765381, + "num_tokens": 508770712.0, + "step": 19661 + }, + { + "epoch": 2.159235668789809, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2653720378875732, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6853556632995605, + "num_tokens": 508800724.0, + "step": 19662 + }, + { + "epoch": 2.1593454864924224, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4559147357940674, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7169123291969299, + "num_tokens": 508826714.0, + "step": 19663 + }, + { + "epoch": 2.159455304195036, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.529453992843628, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7123001217842102, + "num_tokens": 508852337.0, + "step": 19664 + }, + { + "epoch": 2.15956512189765, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.952559232711792, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.707394003868103, + "num_tokens": 508872900.0, + "step": 19665 + }, + { + "epoch": 2.1596749396002637, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5256388187408447, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7243943214416504, + "num_tokens": 508898461.0, + "step": 19666 + }, + { + "epoch": 2.1597847573028774, + "ewc_loss": 
2.110004425048828e-05, + "grad_norm": 2.3705995082855225, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7193660140037537, + "num_tokens": 508925970.0, + "step": 19667 + }, + { + "epoch": 2.1598945750054908, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2693045139312744, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.706428587436676, + "num_tokens": 508954639.0, + "step": 19668 + }, + { + "epoch": 2.1600043927081045, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.285585403442383, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7000857591629028, + "num_tokens": 508987944.0, + "step": 19669 + }, + { + "epoch": 2.1601142104107183, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 3.19724178314209, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.719732403755188, + "num_tokens": 509006625.0, + "step": 19670 + }, + { + "epoch": 2.160224028113332, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.297478199005127, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7075499892234802, + "num_tokens": 509036342.0, + "step": 19671 + }, + { + "epoch": 2.1603338458159453, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3292434215545654, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7001487016677856, + "num_tokens": 509064156.0, + "step": 19672 + }, + { + "epoch": 2.160443663518559, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4468483924865723, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7174644470214844, + "num_tokens": 509088825.0, + "step": 19673 + }, + { + "epoch": 2.160553481221173, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.1879160404205322, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.707679271697998, + "num_tokens": 509120868.0, + "step": 19674 + }, + { + "epoch": 2.1606632989237866, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 
2.211107015609741, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6934542655944824, + "num_tokens": 509150995.0, + "step": 19675 + }, + { + "epoch": 2.1607731166264004, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.307098388671875, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7302478551864624, + "num_tokens": 509177385.0, + "step": 19676 + }, + { + "epoch": 2.1608829343290137, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5691843032836914, + "learning_rate": 1e-06, + "loss": 1.101, + "mean_token_accuracy": 0.6796205043792725, + "num_tokens": 509203424.0, + "step": 19677 + }, + { + "epoch": 2.1609927520316274, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.640638828277588, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.7007898092269897, + "num_tokens": 509228899.0, + "step": 19678 + }, + { + "epoch": 2.161102569734241, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.352942705154419, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7234487533569336, + "num_tokens": 509256290.0, + "step": 19679 + }, + { + "epoch": 2.161212387436855, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.310957193374634, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.715172290802002, + "num_tokens": 509283170.0, + "step": 19680 + }, + { + "epoch": 2.1613222051394683, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3505971431732178, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7023799419403076, + "num_tokens": 509312279.0, + "step": 19681 + }, + { + "epoch": 2.161432022842082, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4591619968414307, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7272119522094727, + "num_tokens": 509338350.0, + "step": 19682 + }, + { + "epoch": 2.161541840544696, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3081765174865723, + "learning_rate": 
1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7202376127243042, + "num_tokens": 509366366.0, + "step": 19683 + }, + { + "epoch": 2.1616516582473095, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.8695170879364014, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7607376575469971, + "num_tokens": 509383927.0, + "step": 19684 + }, + { + "epoch": 2.1617614759499233, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4490461349487305, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7004491090774536, + "num_tokens": 509411164.0, + "step": 19685 + }, + { + "epoch": 2.1618712936525366, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.592902660369873, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7037748098373413, + "num_tokens": 509436736.0, + "step": 19686 + }, + { + "epoch": 2.1619811113551504, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5274910926818848, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6934480667114258, + "num_tokens": 509462765.0, + "step": 19687 + }, + { + "epoch": 2.162090929057764, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6508450508117676, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.73119056224823, + "num_tokens": 509485486.0, + "step": 19688 + }, + { + "epoch": 2.162200746760378, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5137481689453125, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7352182865142822, + "num_tokens": 509511127.0, + "step": 19689 + }, + { + "epoch": 2.1623105644629916, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4656457901000977, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.728240966796875, + "num_tokens": 509536626.0, + "step": 19690 + }, + { + "epoch": 2.162420382165605, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5325610637664795, + "learning_rate": 1e-06, + "loss": 0.9593, + 
"mean_token_accuracy": 0.733856737613678, + "num_tokens": 509560467.0, + "step": 19691 + }, + { + "epoch": 2.1625301998682187, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.548621654510498, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6997041702270508, + "num_tokens": 509587425.0, + "step": 19692 + }, + { + "epoch": 2.1626400175708325, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5730857849121094, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7282693386077881, + "num_tokens": 509610781.0, + "step": 19693 + }, + { + "epoch": 2.162749835273446, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.584272623062134, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7159596681594849, + "num_tokens": 509635032.0, + "step": 19694 + }, + { + "epoch": 2.16285965297606, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3971381187438965, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7111271619796753, + "num_tokens": 509663336.0, + "step": 19695 + }, + { + "epoch": 2.1629694706786733, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.754530429840088, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6932566165924072, + "num_tokens": 509687814.0, + "step": 19696 + }, + { + "epoch": 2.163079288381287, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.448381185531616, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7045376300811768, + "num_tokens": 509713456.0, + "step": 19697 + }, + { + "epoch": 2.163189106083901, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.987084150314331, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7391040325164795, + "num_tokens": 509735769.0, + "step": 19698 + }, + { + "epoch": 2.1632989237865146, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.578911542892456, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7096474170684814, + 
"num_tokens": 509761707.0, + "step": 19699 + }, + { + "epoch": 2.163408741489128, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.1740641593933105, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7193940281867981, + "num_tokens": 509793233.0, + "step": 19700 + }, + { + "epoch": 2.1635185591917416, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2585690021514893, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7111492156982422, + "num_tokens": 509820851.0, + "step": 19701 + }, + { + "epoch": 2.1636283768943554, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.27022385597229, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7143939733505249, + "num_tokens": 509848138.0, + "step": 19702 + }, + { + "epoch": 2.163738194596969, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4025769233703613, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7103699445724487, + "num_tokens": 509875119.0, + "step": 19703 + }, + { + "epoch": 2.1638480122995825, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3927409648895264, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7170637249946594, + "num_tokens": 509902039.0, + "step": 19704 + }, + { + "epoch": 2.163957830002196, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4328794479370117, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7218492031097412, + "num_tokens": 509926147.0, + "step": 19705 + }, + { + "epoch": 2.16406764770481, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6169373989105225, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7066081762313843, + "num_tokens": 509949592.0, + "step": 19706 + }, + { + "epoch": 2.1641774654074237, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.410931348800659, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7208766937255859, + "num_tokens": 509975177.0, + "step": 
19707 + }, + { + "epoch": 2.1642872831100375, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4470486640930176, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7244031429290771, + "num_tokens": 509998908.0, + "step": 19708 + }, + { + "epoch": 2.164397100812651, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.590526580810547, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.724830150604248, + "num_tokens": 510024977.0, + "step": 19709 + }, + { + "epoch": 2.1645069185152646, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.303457260131836, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7220853567123413, + "num_tokens": 510054957.0, + "step": 19710 + }, + { + "epoch": 2.1646167362178783, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.448000431060791, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7227125763893127, + "num_tokens": 510081631.0, + "step": 19711 + }, + { + "epoch": 2.164726553920492, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5166046619415283, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7285627126693726, + "num_tokens": 510107994.0, + "step": 19712 + }, + { + "epoch": 2.164836371623106, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3673338890075684, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6902463436126709, + "num_tokens": 510136020.0, + "step": 19713 + }, + { + "epoch": 2.164946189325719, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.466568946838379, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7235749959945679, + "num_tokens": 510161525.0, + "step": 19714 + }, + { + "epoch": 2.165056007028333, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3687703609466553, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6964594721794128, + "num_tokens": 510188589.0, + "step": 19715 + }, + { + "epoch": 
2.1651658247309467, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.2472944259643555, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7180155515670776, + "num_tokens": 510216497.0, + "step": 19716 + }, + { + "epoch": 2.1652756424335604, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3845012187957764, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.722545862197876, + "num_tokens": 510240815.0, + "step": 19717 + }, + { + "epoch": 2.165385460136174, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5487029552459717, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7172841429710388, + "num_tokens": 510263707.0, + "step": 19718 + }, + { + "epoch": 2.1654952778387875, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4646313190460205, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7461544275283813, + "num_tokens": 510287514.0, + "step": 19719 + }, + { + "epoch": 2.1656050955414012, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3294873237609863, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7348217964172363, + "num_tokens": 510315369.0, + "step": 19720 + }, + { + "epoch": 2.165714913244015, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.519439935684204, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7266513109207153, + "num_tokens": 510339585.0, + "step": 19721 + }, + { + "epoch": 2.1658247309466288, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.393044948577881, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7007840871810913, + "num_tokens": 510367976.0, + "step": 19722 + }, + { + "epoch": 2.165934548649242, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6224842071533203, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7357488870620728, + "num_tokens": 510389909.0, + "step": 19723 + }, + { + "epoch": 2.166044366351856, + "ewc_loss": 
2.110004425048828e-05, + "grad_norm": 2.6258411407470703, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7059577703475952, + "num_tokens": 510412666.0, + "step": 19724 + }, + { + "epoch": 2.1661541840544696, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.8463189601898193, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7381268739700317, + "num_tokens": 510431793.0, + "step": 19725 + }, + { + "epoch": 2.1662640017570833, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.502845525741577, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7247015833854675, + "num_tokens": 510457084.0, + "step": 19726 + }, + { + "epoch": 2.166373819459697, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.414473295211792, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7166857123374939, + "num_tokens": 510483651.0, + "step": 19727 + }, + { + "epoch": 2.1664836371623104, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.558027982711792, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7260677218437195, + "num_tokens": 510508228.0, + "step": 19728 + }, + { + "epoch": 2.166593454864924, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3976521492004395, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.7013109922409058, + "num_tokens": 510535082.0, + "step": 19729 + }, + { + "epoch": 2.166703272567538, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.427395820617676, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7348548173904419, + "num_tokens": 510561521.0, + "step": 19730 + }, + { + "epoch": 2.1668130902701517, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.553739070892334, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7230775952339172, + "num_tokens": 510586160.0, + "step": 19731 + }, + { + "epoch": 2.166922907972765, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 
2.5246927738189697, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7218900322914124, + "num_tokens": 510611053.0, + "step": 19732 + }, + { + "epoch": 2.1670327256753787, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3592112064361572, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7057081460952759, + "num_tokens": 510639136.0, + "step": 19733 + }, + { + "epoch": 2.1671425433779925, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3334929943084717, + "learning_rate": 1e-06, + "loss": 1.0897, + "mean_token_accuracy": 0.683414101600647, + "num_tokens": 510667618.0, + "step": 19734 + }, + { + "epoch": 2.1672523610806063, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.666841506958008, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7379263043403625, + "num_tokens": 510690341.0, + "step": 19735 + }, + { + "epoch": 2.16736217878322, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4255521297454834, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.718637228012085, + "num_tokens": 510717531.0, + "step": 19736 + }, + { + "epoch": 2.1674719964858333, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.20859956741333, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6858353018760681, + "num_tokens": 510750234.0, + "step": 19737 + }, + { + "epoch": 2.167581814188447, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.279832363128662, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7149674892425537, + "num_tokens": 510779786.0, + "step": 19738 + }, + { + "epoch": 2.167691631891061, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.596961498260498, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.723854660987854, + "num_tokens": 510802666.0, + "step": 19739 + }, + { + "epoch": 2.1678014495936746, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.589416265487671, + "learning_rate": 
1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.716584324836731, + "num_tokens": 510825954.0, + "step": 19740 + }, + { + "epoch": 2.1679112672962884, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.7427148818969727, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7413679957389832, + "num_tokens": 510847338.0, + "step": 19741 + }, + { + "epoch": 2.1680210849989017, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.5207152366638184, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7019649147987366, + "num_tokens": 510873730.0, + "step": 19742 + }, + { + "epoch": 2.1681309027015154, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.432513952255249, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6997328400611877, + "num_tokens": 510903789.0, + "step": 19743 + }, + { + "epoch": 2.168240720404129, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.3671767711639404, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7625190019607544, + "num_tokens": 510929730.0, + "step": 19744 + }, + { + "epoch": 2.168350538106743, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4790377616882324, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7301223278045654, + "num_tokens": 510955077.0, + "step": 19745 + }, + { + "epoch": 2.1684603558093567, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.486180067062378, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7318418025970459, + "num_tokens": 510979271.0, + "step": 19746 + }, + { + "epoch": 2.16857017351197, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.0922205448150635, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6985265016555786, + "num_tokens": 511012254.0, + "step": 19747 + }, + { + "epoch": 2.1686799912145838, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.84163236618042, + "learning_rate": 1e-06, + "loss": 0.922, + 
"mean_token_accuracy": 0.7271243333816528, + "num_tokens": 511033036.0, + "step": 19748 + }, + { + "epoch": 2.1687898089171975, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4592955112457275, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7316367626190186, + "num_tokens": 511058387.0, + "step": 19749 + }, + { + "epoch": 2.1688996266198113, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.397718667984009, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7178080081939697, + "num_tokens": 511084828.0, + "step": 19750 + }, + { + "epoch": 2.1690094443224246, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.47047758102417, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7151920795440674, + "num_tokens": 511112041.0, + "step": 19751 + }, + { + "epoch": 2.1691192620250384, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.2108688354492188, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.7037737369537354, + "num_tokens": 511143885.0, + "step": 19752 + }, + { + "epoch": 2.169229079727652, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.4790759086608887, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.711007833480835, + "num_tokens": 511168625.0, + "step": 19753 + }, + { + "epoch": 2.169338897430266, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4169533252716064, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.714562177658081, + "num_tokens": 511194843.0, + "step": 19754 + }, + { + "epoch": 2.1694487151328796, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.6647348403930664, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7088451385498047, + "num_tokens": 511216030.0, + "step": 19755 + }, + { + "epoch": 2.169558532835493, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.465810537338257, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 
0.6993213891983032, + "num_tokens": 511241673.0, + "step": 19756 + }, + { + "epoch": 2.1696683505381067, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 7.067745685577393, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.711103618144989, + "num_tokens": 511265301.0, + "step": 19757 + }, + { + "epoch": 2.1697781682407205, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.8734419345855713, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.721544623374939, + "num_tokens": 511284891.0, + "step": 19758 + }, + { + "epoch": 2.169887985943334, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.4721858501434326, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7031130194664001, + "num_tokens": 511310615.0, + "step": 19759 + }, + { + "epoch": 2.1699978036459475, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.256357192993164, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7236095666885376, + "num_tokens": 511339053.0, + "step": 19760 + }, + { + "epoch": 2.1701076213485613, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.192744731903076, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6981923580169678, + "num_tokens": 511371657.0, + "step": 19761 + }, + { + "epoch": 2.170217439051175, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.303605318069458, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7310582995414734, + "num_tokens": 511400108.0, + "step": 19762 + }, + { + "epoch": 2.170327256753789, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.511192560195923, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7434794306755066, + "num_tokens": 511422134.0, + "step": 19763 + }, + { + "epoch": 2.1704370744564025, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.1438117027282715, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.717371940612793, + 
"num_tokens": 511452620.0, + "step": 19764 + }, + { + "epoch": 2.170546892159016, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.6010563373565674, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7182091474533081, + "num_tokens": 511477119.0, + "step": 19765 + }, + { + "epoch": 2.1706567098616296, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.3271842002868652, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7102774977684021, + "num_tokens": 511506474.0, + "step": 19766 + }, + { + "epoch": 2.1707665275642434, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.5985846519470215, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7394900321960449, + "num_tokens": 511529501.0, + "step": 19767 + }, + { + "epoch": 2.170876345266857, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.388901472091675, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7235325574874878, + "num_tokens": 511558257.0, + "step": 19768 + }, + { + "epoch": 2.170986162969471, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.5149779319763184, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7103105187416077, + "num_tokens": 511583088.0, + "step": 19769 + }, + { + "epoch": 2.171095980672084, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.2994751930236816, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7108769416809082, + "num_tokens": 511612614.0, + "step": 19770 + }, + { + "epoch": 2.171205798374698, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.697918653488159, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7268836498260498, + "num_tokens": 511633909.0, + "step": 19771 + }, + { + "epoch": 2.1713156160773117, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.484811544418335, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7076775431632996, + "num_tokens": 511659968.0, + 
"step": 19772 + }, + { + "epoch": 2.1714254337799255, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.654855251312256, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7230859994888306, + "num_tokens": 511682095.0, + "step": 19773 + }, + { + "epoch": 2.171535251482539, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.4628615379333496, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.7050970792770386, + "num_tokens": 511710990.0, + "step": 19774 + }, + { + "epoch": 2.1716450691851525, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.359290361404419, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7329422235488892, + "num_tokens": 511739624.0, + "step": 19775 + }, + { + "epoch": 2.1717548868877663, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.6203718185424805, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.696213960647583, + "num_tokens": 511764140.0, + "step": 19776 + }, + { + "epoch": 2.17186470459038, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.3746259212493896, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7238081693649292, + "num_tokens": 511791977.0, + "step": 19777 + }, + { + "epoch": 2.171974522292994, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.5450682640075684, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7206289768218994, + "num_tokens": 511815727.0, + "step": 19778 + }, + { + "epoch": 2.172084339995607, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.313530445098877, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6992064118385315, + "num_tokens": 511844926.0, + "step": 19779 + }, + { + "epoch": 2.172194157698221, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.207634210586548, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.728682279586792, + "num_tokens": 511877010.0, + "step": 19780 + }, + { + "epoch": 
2.1723039754008346, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.596763849258423, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7190134525299072, + "num_tokens": 511897821.0, + "step": 19781 + }, + { + "epoch": 2.1724137931034484, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.419978141784668, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7087590098381042, + "num_tokens": 511925977.0, + "step": 19782 + }, + { + "epoch": 2.1725236108060617, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.4385249614715576, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7214337587356567, + "num_tokens": 511949715.0, + "step": 19783 + }, + { + "epoch": 2.1726334285086755, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.335120916366577, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7397787570953369, + "num_tokens": 511977205.0, + "step": 19784 + }, + { + "epoch": 2.1727432462112892, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.443418264389038, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.718341052532196, + "num_tokens": 512002980.0, + "step": 19785 + }, + { + "epoch": 2.172853063913903, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.6589531898498535, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7313084602355957, + "num_tokens": 512023265.0, + "step": 19786 + }, + { + "epoch": 2.1729628816165167, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.311962127685547, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.718559980392456, + "num_tokens": 512049915.0, + "step": 19787 + }, + { + "epoch": 2.17307269931913, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.2589852809906006, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7168141007423401, + "num_tokens": 512078755.0, + "step": 19788 + }, + { + "epoch": 2.173182517021744, + "ewc_loss": 
2.1219253540039062e-05, + "grad_norm": 2.208047866821289, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7294964790344238, + "num_tokens": 512109347.0, + "step": 19789 + }, + { + "epoch": 2.1732923347243576, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.3267290592193604, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7235487103462219, + "num_tokens": 512137702.0, + "step": 19790 + }, + { + "epoch": 2.1734021524269713, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.403606414794922, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7217816710472107, + "num_tokens": 512163873.0, + "step": 19791 + }, + { + "epoch": 2.173511970129585, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.6489498615264893, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7195812463760376, + "num_tokens": 512187004.0, + "step": 19792 + }, + { + "epoch": 2.1736217878321984, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.8939101696014404, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7276979684829712, + "num_tokens": 512206001.0, + "step": 19793 + }, + { + "epoch": 2.173731605534812, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.3012208938598633, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6998448371887207, + "num_tokens": 512239300.0, + "step": 19794 + }, + { + "epoch": 2.173841423237426, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.4576539993286133, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7227246165275574, + "num_tokens": 512267649.0, + "step": 19795 + }, + { + "epoch": 2.1739512409400397, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 2.6957931518554688, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7537857890129089, + "num_tokens": 512287428.0, + "step": 19796 + }, + { + "epoch": 2.1740610586426534, + "ewc_loss": 2.1219253540039062e-05, + 
"grad_norm": 2.4875707626342773, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7212628126144409, + "num_tokens": 512312435.0, + "step": 19797 + }, + { + "epoch": 2.1741708763452667, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5177032947540283, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.730480432510376, + "num_tokens": 512336657.0, + "step": 19798 + }, + { + "epoch": 2.1742806940478805, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.393505573272705, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6986624002456665, + "num_tokens": 512363372.0, + "step": 19799 + }, + { + "epoch": 2.1743905117504942, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.380891799926758, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7115646600723267, + "num_tokens": 512391054.0, + "step": 19800 + }, + { + "epoch": 2.174500329453108, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7031378746032715, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7276852130889893, + "num_tokens": 512413378.0, + "step": 19801 + }, + { + "epoch": 2.1746101471557213, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5618526935577393, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7188723087310791, + "num_tokens": 512436309.0, + "step": 19802 + }, + { + "epoch": 2.174719964858335, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6339223384857178, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7206140756607056, + "num_tokens": 512458612.0, + "step": 19803 + }, + { + "epoch": 2.174829782560949, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5410149097442627, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6996936798095703, + "num_tokens": 512483455.0, + "step": 19804 + }, + { + "epoch": 2.1749396002635626, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.4112143516540527, + 
"learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7172256708145142, + "num_tokens": 512509908.0, + "step": 19805 + }, + { + "epoch": 2.1750494179661763, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.506378173828125, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7087455987930298, + "num_tokens": 512534717.0, + "step": 19806 + }, + { + "epoch": 2.1751592356687897, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.632049560546875, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7242642641067505, + "num_tokens": 512557093.0, + "step": 19807 + }, + { + "epoch": 2.1752690533714034, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4949300289154053, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7001572847366333, + "num_tokens": 512584806.0, + "step": 19808 + }, + { + "epoch": 2.175378871074017, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.53645396232605, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6943684220314026, + "num_tokens": 512610521.0, + "step": 19809 + }, + { + "epoch": 2.175488688776631, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.233520984649658, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7388834953308105, + "num_tokens": 512638067.0, + "step": 19810 + }, + { + "epoch": 2.1755985064792442, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7038583755493164, + "learning_rate": 1e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7498476505279541, + "num_tokens": 512657400.0, + "step": 19811 + }, + { + "epoch": 2.175708324181858, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.427053213119507, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7243478298187256, + "num_tokens": 512683831.0, + "step": 19812 + }, + { + "epoch": 2.1758181418844718, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.356663942337036, + "learning_rate": 1e-06, + "loss": 
0.9703, + "mean_token_accuracy": 0.7168745398521423, + "num_tokens": 512711437.0, + "step": 19813 + }, + { + "epoch": 2.1759279595870855, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2824642658233643, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7056835889816284, + "num_tokens": 512739807.0, + "step": 19814 + }, + { + "epoch": 2.1760377772896993, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.1052188873291016, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7277743220329285, + "num_tokens": 512772090.0, + "step": 19815 + }, + { + "epoch": 2.1761475949923126, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6821091175079346, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7089651823043823, + "num_tokens": 512796315.0, + "step": 19816 + }, + { + "epoch": 2.1762574126949263, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.401940107345581, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6992013454437256, + "num_tokens": 512823980.0, + "step": 19817 + }, + { + "epoch": 2.17636723039754, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2231028079986572, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7308074235916138, + "num_tokens": 512853809.0, + "step": 19818 + }, + { + "epoch": 2.176477048100154, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6321163177490234, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7295559048652649, + "num_tokens": 512877992.0, + "step": 19819 + }, + { + "epoch": 2.1765868658027676, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5804941654205322, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7300818562507629, + "num_tokens": 512900783.0, + "step": 19820 + }, + { + "epoch": 2.176696683505381, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.40690016746521, + "learning_rate": 1e-06, + "loss": 0.9708, + 
"mean_token_accuracy": 0.7145648002624512, + "num_tokens": 512926616.0, + "step": 19821 + }, + { + "epoch": 2.1768065012079947, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.583303689956665, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7390002012252808, + "num_tokens": 512950669.0, + "step": 19822 + }, + { + "epoch": 2.1769163189106084, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.46641206741333, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7057428359985352, + "num_tokens": 512975107.0, + "step": 19823 + }, + { + "epoch": 2.177026136613222, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.32907772064209, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7338984608650208, + "num_tokens": 513001713.0, + "step": 19824 + }, + { + "epoch": 2.1771359543158355, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6628739833831787, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6942533254623413, + "num_tokens": 513026129.0, + "step": 19825 + }, + { + "epoch": 2.1772457720184493, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4748895168304443, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7383060455322266, + "num_tokens": 513049616.0, + "step": 19826 + }, + { + "epoch": 2.177355589721063, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4397947788238525, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7073981761932373, + "num_tokens": 513076361.0, + "step": 19827 + }, + { + "epoch": 2.177465407423677, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.346517324447632, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7117807865142822, + "num_tokens": 513103893.0, + "step": 19828 + }, + { + "epoch": 2.1775752251262905, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6909353733062744, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 
0.7131904363632202, + "num_tokens": 513125256.0, + "step": 19829 + }, + { + "epoch": 2.177685042828904, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.361386775970459, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6884540319442749, + "num_tokens": 513152505.0, + "step": 19830 + }, + { + "epoch": 2.1777948605315176, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.110746145248413, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7277116775512695, + "num_tokens": 513182326.0, + "step": 19831 + }, + { + "epoch": 2.1779046782341314, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.523345947265625, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7436087131500244, + "num_tokens": 513204562.0, + "step": 19832 + }, + { + "epoch": 2.178014495936745, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.189175605773926, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7451449036598206, + "num_tokens": 513231469.0, + "step": 19833 + }, + { + "epoch": 2.1781243136393584, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1866798400878906, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.724550724029541, + "num_tokens": 513260543.0, + "step": 19834 + }, + { + "epoch": 2.178234131341972, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6029958724975586, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7362545132637024, + "num_tokens": 513283046.0, + "step": 19835 + }, + { + "epoch": 2.178343949044586, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4367592334747314, + "learning_rate": 1e-06, + "loss": 1.0989, + "mean_token_accuracy": 0.6794008612632751, + "num_tokens": 513309369.0, + "step": 19836 + }, + { + "epoch": 2.1784537667471997, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4762609004974365, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6829569935798645, + 
"num_tokens": 513337836.0, + "step": 19837 + }, + { + "epoch": 2.1785635844498135, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1986711025238037, + "learning_rate": 1e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.6772273778915405, + "num_tokens": 513370115.0, + "step": 19838 + }, + { + "epoch": 2.1786734021524268, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3870275020599365, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7056956887245178, + "num_tokens": 513398324.0, + "step": 19839 + }, + { + "epoch": 2.1787832198550405, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.388244390487671, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7154654264450073, + "num_tokens": 513427101.0, + "step": 19840 + }, + { + "epoch": 2.1788930375576543, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5448992252349854, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7350074052810669, + "num_tokens": 513450935.0, + "step": 19841 + }, + { + "epoch": 2.179002855260268, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.541184902191162, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7117892503738403, + "num_tokens": 513476918.0, + "step": 19842 + }, + { + "epoch": 2.179112672962882, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.291987895965576, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7151739001274109, + "num_tokens": 513508803.0, + "step": 19843 + }, + { + "epoch": 2.179222490665495, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5999345779418945, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7133804559707642, + "num_tokens": 513533374.0, + "step": 19844 + }, + { + "epoch": 2.179332308368109, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5264813899993896, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7269605398178101, + "num_tokens": 513557068.0, + 
"step": 19845 + }, + { + "epoch": 2.1794421260707226, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.57120943069458, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.724555253982544, + "num_tokens": 513579110.0, + "step": 19846 + }, + { + "epoch": 2.1795519437733364, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4111862182617188, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7112001180648804, + "num_tokens": 513604626.0, + "step": 19847 + }, + { + "epoch": 2.17966176147595, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4260010719299316, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.7017449736595154, + "num_tokens": 513630768.0, + "step": 19848 + }, + { + "epoch": 2.1797715791785635, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4095675945281982, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7294342517852783, + "num_tokens": 513656694.0, + "step": 19849 + }, + { + "epoch": 2.179881396881177, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.307027578353882, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.7030737400054932, + "num_tokens": 513685769.0, + "step": 19850 + }, + { + "epoch": 2.179991214583791, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1409547328948975, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6908649206161499, + "num_tokens": 513719014.0, + "step": 19851 + }, + { + "epoch": 2.1801010322864047, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.511765956878662, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.7050557732582092, + "num_tokens": 513745490.0, + "step": 19852 + }, + { + "epoch": 2.180210849989018, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3295865058898926, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7054681777954102, + "num_tokens": 513774431.0, + "step": 19853 + }, + { + "epoch": 
2.180320667691632, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3480215072631836, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7158336639404297, + "num_tokens": 513799223.0, + "step": 19854 + }, + { + "epoch": 2.1804304853942456, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.425248384475708, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7079851627349854, + "num_tokens": 513826121.0, + "step": 19855 + }, + { + "epoch": 2.1805403030968593, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2952418327331543, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7050027847290039, + "num_tokens": 513857479.0, + "step": 19856 + }, + { + "epoch": 2.180650120799473, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.9653592109680176, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.704566478729248, + "num_tokens": 513878498.0, + "step": 19857 + }, + { + "epoch": 2.1807599385020864, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7100753784179688, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7204986214637756, + "num_tokens": 513899898.0, + "step": 19858 + }, + { + "epoch": 2.1808697562047, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2794394493103027, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7045612335205078, + "num_tokens": 513929478.0, + "step": 19859 + }, + { + "epoch": 2.180979573907314, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 6.997859954833984, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7114828824996948, + "num_tokens": 513956905.0, + "step": 19860 + }, + { + "epoch": 2.1810893916099277, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.386733055114746, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7284227609634399, + "num_tokens": 513984325.0, + "step": 19861 + }, + { + "epoch": 2.181199209312541, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.3912322521209717, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6938369274139404, + "num_tokens": 514013885.0, + "step": 19862 + }, + { + "epoch": 2.1813090270151547, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1679093837738037, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7100879549980164, + "num_tokens": 514045712.0, + "step": 19863 + }, + { + "epoch": 2.1814188447177685, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6921441555023193, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7197155356407166, + "num_tokens": 514067777.0, + "step": 19864 + }, + { + "epoch": 2.1815286624203822, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 3.009626626968384, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7405475378036499, + "num_tokens": 514084239.0, + "step": 19865 + }, + { + "epoch": 2.181638480122996, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.382270336151123, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.7010983824729919, + "num_tokens": 514111071.0, + "step": 19866 + }, + { + "epoch": 2.1817482978256093, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3970718383789062, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7276835441589355, + "num_tokens": 514139515.0, + "step": 19867 + }, + { + "epoch": 2.181858115528223, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.42594575881958, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7021321058273315, + "num_tokens": 514166181.0, + "step": 19868 + }, + { + "epoch": 2.181967933230837, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.231912612915039, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7269222736358643, + "num_tokens": 514194972.0, + "step": 19869 + }, + { + "epoch": 2.1820777509334506, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.3698108196258545, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7038432359695435, + "num_tokens": 514224534.0, + "step": 19870 + }, + { + "epoch": 2.1821875686360643, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.19168758392334, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7282154560089111, + "num_tokens": 514256824.0, + "step": 19871 + }, + { + "epoch": 2.1822973863386776, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2725067138671875, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.723362386226654, + "num_tokens": 514284891.0, + "step": 19872 + }, + { + "epoch": 2.1824072040412914, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 4.116331100463867, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7312495708465576, + "num_tokens": 514306297.0, + "step": 19873 + }, + { + "epoch": 2.182517021743905, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.400461196899414, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7241092920303345, + "num_tokens": 514331575.0, + "step": 19874 + }, + { + "epoch": 2.182626839446519, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5365991592407227, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7154185771942139, + "num_tokens": 514355046.0, + "step": 19875 + }, + { + "epoch": 2.1827366571491327, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.435189723968506, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7184812426567078, + "num_tokens": 514381567.0, + "step": 19876 + }, + { + "epoch": 2.182846474851746, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6024868488311768, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7268236875534058, + "num_tokens": 514404830.0, + "step": 19877 + }, + { + "epoch": 2.1829562925543597, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.350459575653076, 
+ "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7230328917503357, + "num_tokens": 514434722.0, + "step": 19878 + }, + { + "epoch": 2.1830661102569735, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.623382329940796, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7040957808494568, + "num_tokens": 514457611.0, + "step": 19879 + }, + { + "epoch": 2.1831759279595873, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.924638271331787, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7276639938354492, + "num_tokens": 514477830.0, + "step": 19880 + }, + { + "epoch": 2.1832857456622006, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3237195014953613, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7270452976226807, + "num_tokens": 514503871.0, + "step": 19881 + }, + { + "epoch": 2.1833955633648143, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.65262770652771, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7157046794891357, + "num_tokens": 514525892.0, + "step": 19882 + }, + { + "epoch": 2.183505381067428, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.217986583709717, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7043668627738953, + "num_tokens": 514557113.0, + "step": 19883 + }, + { + "epoch": 2.183615198770042, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.380572557449341, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7217038869857788, + "num_tokens": 514588219.0, + "step": 19884 + }, + { + "epoch": 2.1837250164726556, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.378281593322754, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7250486612319946, + "num_tokens": 514615084.0, + "step": 19885 + }, + { + "epoch": 2.183834834175269, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6394567489624023, + "learning_rate": 1e-06, + "loss": 
0.9464, + "mean_token_accuracy": 0.7208379507064819, + "num_tokens": 514639856.0, + "step": 19886 + }, + { + "epoch": 2.1839446518778827, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8809878826141357, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7315759658813477, + "num_tokens": 514659269.0, + "step": 19887 + }, + { + "epoch": 2.1840544695804964, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.557985782623291, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7136948108673096, + "num_tokens": 514682620.0, + "step": 19888 + }, + { + "epoch": 2.18416428728311, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6448655128479004, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7034532427787781, + "num_tokens": 514705976.0, + "step": 19889 + }, + { + "epoch": 2.1842741049857235, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.6359899044036865, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7423043251037598, + "num_tokens": 514727483.0, + "step": 19890 + }, + { + "epoch": 2.1843839226883373, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 4.012836933135986, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7131495475769043, + "num_tokens": 514750353.0, + "step": 19891 + }, + { + "epoch": 2.184493740390951, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4151577949523926, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7061949968338013, + "num_tokens": 514776854.0, + "step": 19892 + }, + { + "epoch": 2.1846035580935648, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.384167432785034, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7144148349761963, + "num_tokens": 514809145.0, + "step": 19893 + }, + { + "epoch": 2.1847133757961785, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4009478092193604, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 
0.7142729163169861, + "num_tokens": 514837614.0, + "step": 19894 + }, + { + "epoch": 2.184823193498792, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.577183723449707, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6853533387184143, + "num_tokens": 514863122.0, + "step": 19895 + }, + { + "epoch": 2.1849330112014056, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6607863903045654, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.733238160610199, + "num_tokens": 514884835.0, + "step": 19896 + }, + { + "epoch": 2.1850428289040194, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.539675712585449, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7227209210395813, + "num_tokens": 514910027.0, + "step": 19897 + }, + { + "epoch": 2.185152646606633, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.603588342666626, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7439116835594177, + "num_tokens": 514934175.0, + "step": 19898 + }, + { + "epoch": 2.185262464309247, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.691178321838379, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7114487290382385, + "num_tokens": 514956012.0, + "step": 19899 + }, + { + "epoch": 2.18537228201186, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4057557582855225, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7189391851425171, + "num_tokens": 514982460.0, + "step": 19900 + }, + { + "epoch": 2.185482099714474, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.67478346824646, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7334489822387695, + "num_tokens": 515005156.0, + "step": 19901 + }, + { + "epoch": 2.1855919174170877, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5586321353912354, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7070372700691223, + "num_tokens": 
515030699.0, + "step": 19902 + }, + { + "epoch": 2.1857017351197014, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4688918590545654, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7151920199394226, + "num_tokens": 515055876.0, + "step": 19903 + }, + { + "epoch": 2.1858115528223148, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.311901092529297, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7003277540206909, + "num_tokens": 515086888.0, + "step": 19904 + }, + { + "epoch": 2.1859213705249285, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2919914722442627, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7230814099311829, + "num_tokens": 515115200.0, + "step": 19905 + }, + { + "epoch": 2.1860311882275423, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3617799282073975, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7160451412200928, + "num_tokens": 515141586.0, + "step": 19906 + }, + { + "epoch": 2.186141005930156, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4001212120056152, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7190055847167969, + "num_tokens": 515167298.0, + "step": 19907 + }, + { + "epoch": 2.18625082363277, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5607872009277344, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7188121676445007, + "num_tokens": 515190325.0, + "step": 19908 + }, + { + "epoch": 2.186360641335383, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.499797821044922, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7293221950531006, + "num_tokens": 515215158.0, + "step": 19909 + }, + { + "epoch": 2.186470459037997, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2568435668945312, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7268615961074829, + "num_tokens": 515244882.0, + "step": 19910 + 
}, + { + "epoch": 2.1865802767406106, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.747098445892334, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7244699001312256, + "num_tokens": 515265951.0, + "step": 19911 + }, + { + "epoch": 2.1866900944432244, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.689784288406372, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7127022743225098, + "num_tokens": 515287808.0, + "step": 19912 + }, + { + "epoch": 2.1867999121458377, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6015591621398926, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7199453115463257, + "num_tokens": 515312532.0, + "step": 19913 + }, + { + "epoch": 2.1869097298484514, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.9860804080963135, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7313349843025208, + "num_tokens": 515330522.0, + "step": 19914 + }, + { + "epoch": 2.187019547551065, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6815011501312256, + "learning_rate": 1e-06, + "loss": 1.0945, + "mean_token_accuracy": 0.6926285028457642, + "num_tokens": 515353629.0, + "step": 19915 + }, + { + "epoch": 2.187129365253679, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.640185832977295, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7239428758621216, + "num_tokens": 515377622.0, + "step": 19916 + }, + { + "epoch": 2.1872391829562927, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.500063896179199, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7099984884262085, + "num_tokens": 515402491.0, + "step": 19917 + }, + { + "epoch": 2.187349000658906, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4527699947357178, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7385075092315674, + "num_tokens": 515427669.0, + "step": 19918 + }, + { + "epoch": 
2.18745881836152, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.323237895965576, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6892673373222351, + "num_tokens": 515455641.0, + "step": 19919 + }, + { + "epoch": 2.1875686360641335, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4794833660125732, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7274552583694458, + "num_tokens": 515480433.0, + "step": 19920 + }, + { + "epoch": 2.1876784537667473, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.540315628051758, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7154513597488403, + "num_tokens": 515506270.0, + "step": 19921 + }, + { + "epoch": 2.187788271469361, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5231077671051025, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.714919924736023, + "num_tokens": 515532306.0, + "step": 19922 + }, + { + "epoch": 2.1878980891719744, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4233710765838623, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7047524452209473, + "num_tokens": 515556984.0, + "step": 19923 + }, + { + "epoch": 2.188007906874588, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5562689304351807, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7129715085029602, + "num_tokens": 515581961.0, + "step": 19924 + }, + { + "epoch": 2.188117724577202, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.913132905960083, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7250247001647949, + "num_tokens": 515599473.0, + "step": 19925 + }, + { + "epoch": 2.1882275422798156, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.43203067779541, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6984115242958069, + "num_tokens": 515623550.0, + "step": 19926 + }, + { + "epoch": 2.1883373599824294, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.5168466567993164, + "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7468314170837402, + "num_tokens": 515647512.0, + "step": 19927 + }, + { + "epoch": 2.1884471776850427, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.798353433609009, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7445493936538696, + "num_tokens": 515666805.0, + "step": 19928 + }, + { + "epoch": 2.1885569953876565, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.716005802154541, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7256534099578857, + "num_tokens": 515691097.0, + "step": 19929 + }, + { + "epoch": 2.1886668130902702, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.241553783416748, + "learning_rate": 1e-06, + "loss": 1.095, + "mean_token_accuracy": 0.6873722672462463, + "num_tokens": 515721818.0, + "step": 19930 + }, + { + "epoch": 2.188776630792884, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.200673818588257, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6973282694816589, + "num_tokens": 515754030.0, + "step": 19931 + }, + { + "epoch": 2.1888864484954973, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.371506929397583, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7188997268676758, + "num_tokens": 515780601.0, + "step": 19932 + }, + { + "epoch": 2.188996266198111, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5155200958251953, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.72163325548172, + "num_tokens": 515803712.0, + "step": 19933 + }, + { + "epoch": 2.189106083900725, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.568476438522339, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7359440326690674, + "num_tokens": 515826421.0, + "step": 19934 + }, + { + "epoch": 2.1892159016033386, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.59260630607605, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6999843716621399, + "num_tokens": 515852185.0, + "step": 19935 + }, + { + "epoch": 2.1893257193059523, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.374061107635498, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7047029733657837, + "num_tokens": 515880345.0, + "step": 19936 + }, + { + "epoch": 2.1894355370085656, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2777180671691895, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7137528657913208, + "num_tokens": 515909406.0, + "step": 19937 + }, + { + "epoch": 2.1895453547111794, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.251920223236084, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.720723032951355, + "num_tokens": 515938440.0, + "step": 19938 + }, + { + "epoch": 2.189655172413793, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.670990228652954, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.715171754360199, + "num_tokens": 515960058.0, + "step": 19939 + }, + { + "epoch": 2.189764990116407, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.816181182861328, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.720391571521759, + "num_tokens": 515980139.0, + "step": 19940 + }, + { + "epoch": 2.18987480781902, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.675431251525879, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.717947244644165, + "num_tokens": 516000716.0, + "step": 19941 + }, + { + "epoch": 2.189984625521634, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.9480361938476562, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.726982593536377, + "num_tokens": 516019694.0, + "step": 19942 + }, + { + "epoch": 2.1900944432242477, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3261961936950684, + 
"learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6993544697761536, + "num_tokens": 516049134.0, + "step": 19943 + }, + { + "epoch": 2.1902042609268615, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.370915651321411, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7067921757698059, + "num_tokens": 516076668.0, + "step": 19944 + }, + { + "epoch": 2.1903140786294752, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5383455753326416, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7192392349243164, + "num_tokens": 516101344.0, + "step": 19945 + }, + { + "epoch": 2.1904238963320886, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.467547655105591, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.7010000944137573, + "num_tokens": 516127699.0, + "step": 19946 + }, + { + "epoch": 2.1905337140347023, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4470603466033936, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7277805805206299, + "num_tokens": 516153496.0, + "step": 19947 + }, + { + "epoch": 2.190643531737316, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7593612670898438, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6943652033805847, + "num_tokens": 516179007.0, + "step": 19948 + }, + { + "epoch": 2.19075334943993, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5345771312713623, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7129011154174805, + "num_tokens": 516203928.0, + "step": 19949 + }, + { + "epoch": 2.1908631671425436, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6531200408935547, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7230291366577148, + "num_tokens": 516227878.0, + "step": 19950 + }, + { + "epoch": 2.190972984845157, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.358205795288086, + "learning_rate": 1e-06, + 
"loss": 0.9647, + "mean_token_accuracy": 0.7084475159645081, + "num_tokens": 516254967.0, + "step": 19951 + }, + { + "epoch": 2.1910828025477707, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.528735637664795, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7383326292037964, + "num_tokens": 516277432.0, + "step": 19952 + }, + { + "epoch": 2.1911926202503844, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8152921199798584, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7041572332382202, + "num_tokens": 516300329.0, + "step": 19953 + }, + { + "epoch": 2.191302437952998, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3619065284729004, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6964483857154846, + "num_tokens": 516328649.0, + "step": 19954 + }, + { + "epoch": 2.1914122556556115, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.441603660583496, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7073918581008911, + "num_tokens": 516353113.0, + "step": 19955 + }, + { + "epoch": 2.1915220733582252, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1761651039123535, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7237082719802856, + "num_tokens": 516383463.0, + "step": 19956 + }, + { + "epoch": 2.191631891060839, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.395753860473633, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7349492311477661, + "num_tokens": 516407754.0, + "step": 19957 + }, + { + "epoch": 2.1917417087634528, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.621969223022461, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7318599820137024, + "num_tokens": 516428975.0, + "step": 19958 + }, + { + "epoch": 2.1918515264660665, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.941725730895996, + "learning_rate": 1e-06, + "loss": 0.8788, + 
"mean_token_accuracy": 0.7307928800582886, + "num_tokens": 516448000.0, + "step": 19959 + }, + { + "epoch": 2.19196134416868, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.303299903869629, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6972883343696594, + "num_tokens": 516475023.0, + "step": 19960 + }, + { + "epoch": 2.1920711618712936, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4342596530914307, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7237259745597839, + "num_tokens": 516501229.0, + "step": 19961 + }, + { + "epoch": 2.1921809795739073, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.301516056060791, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7172008752822876, + "num_tokens": 516532095.0, + "step": 19962 + }, + { + "epoch": 2.192290797276521, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.558196783065796, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7159065008163452, + "num_tokens": 516555590.0, + "step": 19963 + }, + { + "epoch": 2.1924006149791344, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.284266233444214, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7087472677230835, + "num_tokens": 516586018.0, + "step": 19964 + }, + { + "epoch": 2.192510432681748, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5785574913024902, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7045340538024902, + "num_tokens": 516610101.0, + "step": 19965 + }, + { + "epoch": 2.192620250384362, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4833881855010986, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7155260443687439, + "num_tokens": 516634371.0, + "step": 19966 + }, + { + "epoch": 2.1927300680869757, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6017682552337646, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 
0.7321425676345825, + "num_tokens": 516659537.0, + "step": 19967 + }, + { + "epoch": 2.1928398857895894, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5316174030303955, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7202805280685425, + "num_tokens": 516684554.0, + "step": 19968 + }, + { + "epoch": 2.1929497034922028, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7317750453948975, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7359294891357422, + "num_tokens": 516706127.0, + "step": 19969 + }, + { + "epoch": 2.1930595211948165, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.527139663696289, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7191063165664673, + "num_tokens": 516729918.0, + "step": 19970 + }, + { + "epoch": 2.1931693388974303, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.438894033432007, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7311303615570068, + "num_tokens": 516754609.0, + "step": 19971 + }, + { + "epoch": 2.193279156600044, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5673091411590576, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.700783371925354, + "num_tokens": 516779618.0, + "step": 19972 + }, + { + "epoch": 2.193388974302658, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4653618335723877, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7246449589729309, + "num_tokens": 516803725.0, + "step": 19973 + }, + { + "epoch": 2.193498792005271, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3283743858337402, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7370250225067139, + "num_tokens": 516829951.0, + "step": 19974 + }, + { + "epoch": 2.193608609707885, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2471466064453125, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6899747848510742, + 
"num_tokens": 516861823.0, + "step": 19975 + }, + { + "epoch": 2.1937184274104986, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3592443466186523, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7073132991790771, + "num_tokens": 516888694.0, + "step": 19976 + }, + { + "epoch": 2.1938282451131124, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.36094331741333, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7371965646743774, + "num_tokens": 516913859.0, + "step": 19977 + }, + { + "epoch": 2.193938062815726, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7922463417053223, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7175537943840027, + "num_tokens": 516934106.0, + "step": 19978 + }, + { + "epoch": 2.1940478805183394, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2758688926696777, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7265348434448242, + "num_tokens": 516962220.0, + "step": 19979 + }, + { + "epoch": 2.194157698220953, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3466556072235107, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7100845575332642, + "num_tokens": 516992097.0, + "step": 19980 + }, + { + "epoch": 2.194267515923567, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.107243537902832, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7168853282928467, + "num_tokens": 517024985.0, + "step": 19981 + }, + { + "epoch": 2.1943773336261807, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5363857746124268, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7137717604637146, + "num_tokens": 517049686.0, + "step": 19982 + }, + { + "epoch": 2.194487151328794, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.618756055831909, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7166824340820312, + "num_tokens": 517073345.0, + 
"step": 19983 + }, + { + "epoch": 2.1945969690314078, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3939132690429688, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7197990417480469, + "num_tokens": 517099659.0, + "step": 19984 + }, + { + "epoch": 2.1947067867340215, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.234238862991333, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.7078126668930054, + "num_tokens": 517127778.0, + "step": 19985 + }, + { + "epoch": 2.1948166044366353, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6854867935180664, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7486855983734131, + "num_tokens": 517147665.0, + "step": 19986 + }, + { + "epoch": 2.194926422139249, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3664309978485107, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7041597962379456, + "num_tokens": 517173900.0, + "step": 19987 + }, + { + "epoch": 2.1950362398418624, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3798434734344482, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7238647937774658, + "num_tokens": 517201170.0, + "step": 19988 + }, + { + "epoch": 2.195146057544476, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3579137325286865, + "learning_rate": 1e-06, + "loss": 1.0817, + "mean_token_accuracy": 0.6781463623046875, + "num_tokens": 517230427.0, + "step": 19989 + }, + { + "epoch": 2.19525587524709, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.295243501663208, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6985585689544678, + "num_tokens": 517265792.0, + "step": 19990 + }, + { + "epoch": 2.1953656929497036, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.651035785675049, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7214162945747375, + "num_tokens": 517287592.0, + "step": 19991 + }, + { + 
"epoch": 2.195475510652317, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.548619508743286, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7339800596237183, + "num_tokens": 517310886.0, + "step": 19992 + }, + { + "epoch": 2.1955853283549307, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6610605716705322, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7290074825286865, + "num_tokens": 517332572.0, + "step": 19993 + }, + { + "epoch": 2.1956951460575445, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6599857807159424, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7180237174034119, + "num_tokens": 517355634.0, + "step": 19994 + }, + { + "epoch": 2.195804963760158, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3581900596618652, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6867145299911499, + "num_tokens": 517385468.0, + "step": 19995 + }, + { + "epoch": 2.195914781462772, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.176297426223755, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7072041630744934, + "num_tokens": 517417204.0, + "step": 19996 + }, + { + "epoch": 2.1960245991653853, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5288617610931396, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7438157200813293, + "num_tokens": 517441033.0, + "step": 19997 + }, + { + "epoch": 2.196134416867999, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.423804759979248, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6967985033988953, + "num_tokens": 517469258.0, + "step": 19998 + }, + { + "epoch": 2.196244234570613, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5013082027435303, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.693247377872467, + "num_tokens": 517495697.0, + "step": 19999 + }, + { + "epoch": 2.1963540522732266, + 
"ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1910440921783447, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7185286283493042, + "num_tokens": 517526328.0, + "step": 20000 + }, + { + "epoch": 2.1964638699758403, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5416951179504395, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6897441148757935, + "num_tokens": 517556477.0, + "step": 20001 + }, + { + "epoch": 2.1965736876784536, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3942668437957764, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7116438150405884, + "num_tokens": 517583923.0, + "step": 20002 + }, + { + "epoch": 2.1966835053810674, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.155447483062744, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6965094804763794, + "num_tokens": 517618071.0, + "step": 20003 + }, + { + "epoch": 2.196793323083681, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.654543876647949, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7467470169067383, + "num_tokens": 517640507.0, + "step": 20004 + }, + { + "epoch": 2.196903140786295, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5223538875579834, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7245351076126099, + "num_tokens": 517665619.0, + "step": 20005 + }, + { + "epoch": 2.197012958488908, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2436468601226807, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7291397452354431, + "num_tokens": 517693362.0, + "step": 20006 + }, + { + "epoch": 2.197122776191522, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7628066539764404, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7158834934234619, + "num_tokens": 517716288.0, + "step": 20007 + }, + { + "epoch": 2.1972325938941357, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.4907584190368652, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6984491348266602, + "num_tokens": 517741774.0, + "step": 20008 + }, + { + "epoch": 2.1973424115967495, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2132060527801514, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.7018396854400635, + "num_tokens": 517773664.0, + "step": 20009 + }, + { + "epoch": 2.1974522292993632, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6853108406066895, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7244082689285278, + "num_tokens": 517793926.0, + "step": 20010 + }, + { + "epoch": 2.1975620470019765, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2978265285491943, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7043928503990173, + "num_tokens": 517823769.0, + "step": 20011 + }, + { + "epoch": 2.1976718647045903, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5449891090393066, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7183710932731628, + "num_tokens": 517849202.0, + "step": 20012 + }, + { + "epoch": 2.197781682407204, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3979482650756836, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.7076579332351685, + "num_tokens": 517877265.0, + "step": 20013 + }, + { + "epoch": 2.197891500109818, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5042226314544678, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7201721668243408, + "num_tokens": 517900377.0, + "step": 20014 + }, + { + "epoch": 2.198001317812431, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2245373725891113, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.740045428276062, + "num_tokens": 517927383.0, + "step": 20015 + }, + { + "epoch": 2.198111135515045, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.305215835571289, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6939839124679565, + "num_tokens": 517957042.0, + "step": 20016 + }, + { + "epoch": 2.1982209532176586, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4394948482513428, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7054522633552551, + "num_tokens": 517982093.0, + "step": 20017 + }, + { + "epoch": 2.1983307709202724, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.379714012145996, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7368297576904297, + "num_tokens": 518006726.0, + "step": 20018 + }, + { + "epoch": 2.198440588622886, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3975088596343994, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7123931646347046, + "num_tokens": 518038182.0, + "step": 20019 + }, + { + "epoch": 2.1985504063254995, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4716029167175293, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7426674365997314, + "num_tokens": 518061637.0, + "step": 20020 + }, + { + "epoch": 2.1986602240281132, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.310570478439331, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7060937881469727, + "num_tokens": 518091519.0, + "step": 20021 + }, + { + "epoch": 2.198770041730727, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4747672080993652, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6949236989021301, + "num_tokens": 518116665.0, + "step": 20022 + }, + { + "epoch": 2.1988798594333407, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.182166576385498, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7027379870414734, + "num_tokens": 518151433.0, + "step": 20023 + }, + { + "epoch": 2.1989896771359545, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 
2.8671936988830566, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6967271566390991, + "num_tokens": 518172320.0, + "step": 20024 + }, + { + "epoch": 2.199099494838568, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7446603775024414, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7491767406463623, + "num_tokens": 518192686.0, + "step": 20025 + }, + { + "epoch": 2.1992093125411816, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.52893328666687, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.7010285258293152, + "num_tokens": 518219581.0, + "step": 20026 + }, + { + "epoch": 2.1993191302437953, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3183863162994385, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7142142653465271, + "num_tokens": 518247489.0, + "step": 20027 + }, + { + "epoch": 2.199428947946409, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7874937057495117, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7234961986541748, + "num_tokens": 518269276.0, + "step": 20028 + }, + { + "epoch": 2.199538765649023, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.386430025100708, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7313401699066162, + "num_tokens": 518297160.0, + "step": 20029 + }, + { + "epoch": 2.199648583351636, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4317049980163574, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6977787017822266, + "num_tokens": 518324758.0, + "step": 20030 + }, + { + "epoch": 2.19975840105425, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 8.539497375488281, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7291606664657593, + "num_tokens": 518351349.0, + "step": 20031 + }, + { + "epoch": 2.1998682187568637, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4306392669677734, + 
"learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.72725909948349, + "num_tokens": 518379043.0, + "step": 20032 + }, + { + "epoch": 2.1999780364594774, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3930606842041016, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7100157737731934, + "num_tokens": 518408717.0, + "step": 20033 + }, + { + "epoch": 2.2000878541620907, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7390997409820557, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7098706960678101, + "num_tokens": 518433853.0, + "step": 20034 + }, + { + "epoch": 2.2001976718647045, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6292903423309326, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.701673686504364, + "num_tokens": 518457724.0, + "step": 20035 + }, + { + "epoch": 2.2003074895673183, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2621521949768066, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7384286522865295, + "num_tokens": 518485575.0, + "step": 20036 + }, + { + "epoch": 2.200417307269932, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.349156618118286, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6991760730743408, + "num_tokens": 518517005.0, + "step": 20037 + }, + { + "epoch": 2.2005271249725458, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6503183841705322, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.7008735537528992, + "num_tokens": 518541236.0, + "step": 20038 + }, + { + "epoch": 2.200636942675159, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6077775955200195, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7231320142745972, + "num_tokens": 518565424.0, + "step": 20039 + }, + { + "epoch": 2.200746760377773, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.384620428085327, + "learning_rate": 1e-06, + "loss": 
0.8882, + "mean_token_accuracy": 0.7345799207687378, + "num_tokens": 518589389.0, + "step": 20040 + }, + { + "epoch": 2.2008565780803866, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4326696395874023, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7138984203338623, + "num_tokens": 518614095.0, + "step": 20041 + }, + { + "epoch": 2.2009663957830004, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.475545883178711, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7126502394676208, + "num_tokens": 518638191.0, + "step": 20042 + }, + { + "epoch": 2.2010762134856137, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.562741279602051, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7186667323112488, + "num_tokens": 518660484.0, + "step": 20043 + }, + { + "epoch": 2.2011860311882274, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3989782333374023, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6993607878684998, + "num_tokens": 518687926.0, + "step": 20044 + }, + { + "epoch": 2.201295848890841, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2786083221435547, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7000011205673218, + "num_tokens": 518719279.0, + "step": 20045 + }, + { + "epoch": 2.201405666593455, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2484192848205566, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.7051330804824829, + "num_tokens": 518753491.0, + "step": 20046 + }, + { + "epoch": 2.2015154842960687, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4449527263641357, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7065759897232056, + "num_tokens": 518780926.0, + "step": 20047 + }, + { + "epoch": 2.201625301998682, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6239147186279297, + "learning_rate": 1e-06, + "loss": 1.0535, + 
"mean_token_accuracy": 0.6928431391716003, + "num_tokens": 518805478.0, + "step": 20048 + }, + { + "epoch": 2.2017351197012958, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5015883445739746, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7185560464859009, + "num_tokens": 518830675.0, + "step": 20049 + }, + { + "epoch": 2.2018449374039095, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4760775566101074, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7341291904449463, + "num_tokens": 518854521.0, + "step": 20050 + }, + { + "epoch": 2.2019547551065233, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.320676803588867, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7061532735824585, + "num_tokens": 518882182.0, + "step": 20051 + }, + { + "epoch": 2.202064572809137, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 7.148942470550537, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7147482633590698, + "num_tokens": 518911153.0, + "step": 20052 + }, + { + "epoch": 2.2021743905117503, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5912530422210693, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7057229280471802, + "num_tokens": 518938259.0, + "step": 20053 + }, + { + "epoch": 2.202284208214364, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.543368101119995, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7533674240112305, + "num_tokens": 518960702.0, + "step": 20054 + }, + { + "epoch": 2.202394025916978, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.358809232711792, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6978961229324341, + "num_tokens": 518989298.0, + "step": 20055 + }, + { + "epoch": 2.2025038436195916, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4568142890930176, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 
0.6962960958480835, + "num_tokens": 519014972.0, + "step": 20056 + }, + { + "epoch": 2.2026136613222054, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.626713991165161, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7243247628211975, + "num_tokens": 519038925.0, + "step": 20057 + }, + { + "epoch": 2.2027234790248187, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.432217836380005, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7154717445373535, + "num_tokens": 519064807.0, + "step": 20058 + }, + { + "epoch": 2.2028332967274324, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.577627420425415, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.743087112903595, + "num_tokens": 519086927.0, + "step": 20059 + }, + { + "epoch": 2.202943114430046, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4141664505004883, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7423217296600342, + "num_tokens": 519111766.0, + "step": 20060 + }, + { + "epoch": 2.20305293213266, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.304835557937622, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6995229721069336, + "num_tokens": 519140981.0, + "step": 20061 + }, + { + "epoch": 2.2031627498352733, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4980032444000244, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6999928951263428, + "num_tokens": 519165772.0, + "step": 20062 + }, + { + "epoch": 2.203272567537887, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.431649923324585, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7332018613815308, + "num_tokens": 519191829.0, + "step": 20063 + }, + { + "epoch": 2.203382385240501, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.529465675354004, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6964428424835205, + "num_tokens": 
519216179.0, + "step": 20064 + }, + { + "epoch": 2.2034922029431145, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2283036708831787, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7366814613342285, + "num_tokens": 519245075.0, + "step": 20065 + }, + { + "epoch": 2.2036020206457283, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.338685989379883, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7173696756362915, + "num_tokens": 519272624.0, + "step": 20066 + }, + { + "epoch": 2.2037118383483416, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4387104511260986, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7161659598350525, + "num_tokens": 519297407.0, + "step": 20067 + }, + { + "epoch": 2.2038216560509554, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.124020576477051, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6913883090019226, + "num_tokens": 519335419.0, + "step": 20068 + }, + { + "epoch": 2.203931473753569, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.72296404838562, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7366464138031006, + "num_tokens": 519355629.0, + "step": 20069 + }, + { + "epoch": 2.204041291456183, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6056811809539795, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7183388471603394, + "num_tokens": 519381439.0, + "step": 20070 + }, + { + "epoch": 2.204151109158796, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.564347267150879, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7070786952972412, + "num_tokens": 519404641.0, + "step": 20071 + }, + { + "epoch": 2.20426092686141, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2916195392608643, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7080230116844177, + "num_tokens": 519432333.0, + "step": 20072 + }, 
+ { + "epoch": 2.2043707445640237, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.310392141342163, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.704656720161438, + "num_tokens": 519462964.0, + "step": 20073 + }, + { + "epoch": 2.2044805622666375, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4983012676239014, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7209765911102295, + "num_tokens": 519487752.0, + "step": 20074 + }, + { + "epoch": 2.2045903799692512, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2350211143493652, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7093101739883423, + "num_tokens": 519516482.0, + "step": 20075 + }, + { + "epoch": 2.2047001976718645, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.54463529586792, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7352381944656372, + "num_tokens": 519540401.0, + "step": 20076 + }, + { + "epoch": 2.2048100153744783, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.523388624191284, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7044517993927002, + "num_tokens": 519564891.0, + "step": 20077 + }, + { + "epoch": 2.204919833077092, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.177218198776245, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7075207233428955, + "num_tokens": 519595185.0, + "step": 20078 + }, + { + "epoch": 2.205029650779706, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.451441526412964, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7113347053527832, + "num_tokens": 519620293.0, + "step": 20079 + }, + { + "epoch": 2.2051394684823196, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.801287889480591, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7248121500015259, + "num_tokens": 519643636.0, + "step": 20080 + }, + { + "epoch": 2.205249286184933, 
+ "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.335179328918457, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.7010000944137573, + "num_tokens": 519671611.0, + "step": 20081 + }, + { + "epoch": 2.2053591038875466, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1016101837158203, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7015804052352905, + "num_tokens": 519706511.0, + "step": 20082 + }, + { + "epoch": 2.2054689215901604, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6228222846984863, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7248756885528564, + "num_tokens": 519729225.0, + "step": 20083 + }, + { + "epoch": 2.205578739292774, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3276326656341553, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7132880687713623, + "num_tokens": 519757241.0, + "step": 20084 + }, + { + "epoch": 2.2056885569953875, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.344426155090332, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.722261905670166, + "num_tokens": 519783692.0, + "step": 20085 + }, + { + "epoch": 2.205798374698001, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6732993125915527, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7370312213897705, + "num_tokens": 519807339.0, + "step": 20086 + }, + { + "epoch": 2.205908192400615, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6778311729431152, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7282270789146423, + "num_tokens": 519830355.0, + "step": 20087 + }, + { + "epoch": 2.2060180101032287, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.360534191131592, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7148560881614685, + "num_tokens": 519859643.0, + "step": 20088 + }, + { + "epoch": 2.2061278278058425, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.6446805000305176, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7264668941497803, + "num_tokens": 519883210.0, + "step": 20089 + }, + { + "epoch": 2.206237645508456, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6628193855285645, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7404319047927856, + "num_tokens": 519903342.0, + "step": 20090 + }, + { + "epoch": 2.2063474632110696, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.342926502227783, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7203496098518372, + "num_tokens": 519930300.0, + "step": 20091 + }, + { + "epoch": 2.2064572809136833, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.304389476776123, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.7044590711593628, + "num_tokens": 519960474.0, + "step": 20092 + }, + { + "epoch": 2.206567098616297, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.335862159729004, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7241459488868713, + "num_tokens": 519986070.0, + "step": 20093 + }, + { + "epoch": 2.2066769163189104, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4637057781219482, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7266409993171692, + "num_tokens": 520011594.0, + "step": 20094 + }, + { + "epoch": 2.206786734021524, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5097744464874268, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6987463235855103, + "num_tokens": 520037705.0, + "step": 20095 + }, + { + "epoch": 2.206896551724138, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8717174530029297, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.721871554851532, + "num_tokens": 520056695.0, + "step": 20096 + }, + { + "epoch": 2.2070063694267517, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.6326558589935303, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.727731466293335, + "num_tokens": 520079533.0, + "step": 20097 + }, + { + "epoch": 2.2071161871293654, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.577333450317383, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7134610414505005, + "num_tokens": 520104959.0, + "step": 20098 + }, + { + "epoch": 2.2072260048319787, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5477967262268066, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7369215488433838, + "num_tokens": 520126584.0, + "step": 20099 + }, + { + "epoch": 2.2073358225345925, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4340388774871826, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7195619344711304, + "num_tokens": 520149976.0, + "step": 20100 + }, + { + "epoch": 2.2074456402372062, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.435788869857788, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7218629121780396, + "num_tokens": 520177916.0, + "step": 20101 + }, + { + "epoch": 2.20755545793982, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3689894676208496, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.709663987159729, + "num_tokens": 520205584.0, + "step": 20102 + }, + { + "epoch": 2.2076652756424338, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3377909660339355, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7087175250053406, + "num_tokens": 520234122.0, + "step": 20103 + }, + { + "epoch": 2.207775093345047, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.574808120727539, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6929671168327332, + "num_tokens": 520258717.0, + "step": 20104 + }, + { + "epoch": 2.207884911047661, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4878318309783936, 
+ "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7268190383911133, + "num_tokens": 520284535.0, + "step": 20105 + }, + { + "epoch": 2.2079947287502746, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4190549850463867, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7356423139572144, + "num_tokens": 520310647.0, + "step": 20106 + }, + { + "epoch": 2.2081045464528883, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.142266273498535, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6971117854118347, + "num_tokens": 520344625.0, + "step": 20107 + }, + { + "epoch": 2.208214364155502, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4231276512145996, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7208216190338135, + "num_tokens": 520368818.0, + "step": 20108 + }, + { + "epoch": 2.2083241818581154, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3642470836639404, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7181841731071472, + "num_tokens": 520396062.0, + "step": 20109 + }, + { + "epoch": 2.208433999560729, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.385441780090332, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7145954370498657, + "num_tokens": 520423499.0, + "step": 20110 + }, + { + "epoch": 2.208543817263343, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.008451223373413, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7024058103561401, + "num_tokens": 520460909.0, + "step": 20111 + }, + { + "epoch": 2.2086536349659567, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.332794427871704, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7235508561134338, + "num_tokens": 520487116.0, + "step": 20112 + }, + { + "epoch": 2.20876345266857, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.296813726425171, + "learning_rate": 1e-06, + 
"loss": 1.028, + "mean_token_accuracy": 0.698003888130188, + "num_tokens": 520516152.0, + "step": 20113 + }, + { + "epoch": 2.2088732703711838, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5307090282440186, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7299619913101196, + "num_tokens": 520539944.0, + "step": 20114 + }, + { + "epoch": 2.2089830880737975, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2932064533233643, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7225829362869263, + "num_tokens": 520567663.0, + "step": 20115 + }, + { + "epoch": 2.2090929057764113, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6615638732910156, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7353077530860901, + "num_tokens": 520590056.0, + "step": 20116 + }, + { + "epoch": 2.209202723479025, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.614778757095337, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.727912962436676, + "num_tokens": 520612078.0, + "step": 20117 + }, + { + "epoch": 2.2093125411816383, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.907707691192627, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7193509340286255, + "num_tokens": 520631927.0, + "step": 20118 + }, + { + "epoch": 2.209422358884252, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3961784839630127, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7132014036178589, + "num_tokens": 520658703.0, + "step": 20119 + }, + { + "epoch": 2.209532176586866, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7934646606445312, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7242512106895447, + "num_tokens": 520677388.0, + "step": 20120 + }, + { + "epoch": 2.2096419942894796, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4342405796051025, + "learning_rate": 1e-06, + "loss": 1.0089, + 
"mean_token_accuracy": 0.7005035877227783, + "num_tokens": 520706453.0, + "step": 20121 + }, + { + "epoch": 2.209751811992093, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3264386653900146, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7169555425643921, + "num_tokens": 520736876.0, + "step": 20122 + }, + { + "epoch": 2.2098616296947067, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5379552841186523, + "learning_rate": 1e-06, + "loss": 1.0883, + "mean_token_accuracy": 0.6799323558807373, + "num_tokens": 520762241.0, + "step": 20123 + }, + { + "epoch": 2.2099714473973204, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4887828826904297, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7203445434570312, + "num_tokens": 520785181.0, + "step": 20124 + }, + { + "epoch": 2.210081265099934, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.548340320587158, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7131916880607605, + "num_tokens": 520812759.0, + "step": 20125 + }, + { + "epoch": 2.210191082802548, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.731980323791504, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7281603217124939, + "num_tokens": 520839324.0, + "step": 20126 + }, + { + "epoch": 2.2103009005051613, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3564021587371826, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7203016877174377, + "num_tokens": 520867844.0, + "step": 20127 + }, + { + "epoch": 2.210410718207775, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3313393592834473, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6941758394241333, + "num_tokens": 520897337.0, + "step": 20128 + }, + { + "epoch": 2.2105205359103888, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5160040855407715, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 
0.7366145253181458, + "num_tokens": 520922710.0, + "step": 20129 + }, + { + "epoch": 2.2106303536130025, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.544633626937866, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.70260089635849, + "num_tokens": 520948579.0, + "step": 20130 + }, + { + "epoch": 2.2107401713156163, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.462221384048462, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.7039536237716675, + "num_tokens": 520976304.0, + "step": 20131 + }, + { + "epoch": 2.2108499890182296, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2894375324249268, + "learning_rate": 1e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.679844081401825, + "num_tokens": 521005547.0, + "step": 20132 + }, + { + "epoch": 2.2109598067208434, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.487347364425659, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7146767377853394, + "num_tokens": 521032555.0, + "step": 20133 + }, + { + "epoch": 2.211069624423457, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3886425495147705, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6959760189056396, + "num_tokens": 521059832.0, + "step": 20134 + }, + { + "epoch": 2.211179442126071, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.39200496673584, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7050788998603821, + "num_tokens": 521088278.0, + "step": 20135 + }, + { + "epoch": 2.211289259828684, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.646012783050537, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7021640539169312, + "num_tokens": 521111519.0, + "step": 20136 + }, + { + "epoch": 2.211399077531298, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.303612232208252, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7216221690177917, + "num_tokens": 
521139870.0, + "step": 20137 + }, + { + "epoch": 2.2115088952339117, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3827641010284424, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7173622846603394, + "num_tokens": 521167763.0, + "step": 20138 + }, + { + "epoch": 2.2116187129365255, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.68228816986084, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7127166986465454, + "num_tokens": 521193950.0, + "step": 20139 + }, + { + "epoch": 2.211728530639139, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.163935422897339, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7084099650382996, + "num_tokens": 521227519.0, + "step": 20140 + }, + { + "epoch": 2.2118383483417525, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.9245121479034424, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7224969863891602, + "num_tokens": 521247804.0, + "step": 20141 + }, + { + "epoch": 2.2119481660443663, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7836456298828125, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7075890302658081, + "num_tokens": 521269010.0, + "step": 20142 + }, + { + "epoch": 2.21205798374698, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.668323040008545, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7061524391174316, + "num_tokens": 521293769.0, + "step": 20143 + }, + { + "epoch": 2.212167801449594, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.585559606552124, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7209527492523193, + "num_tokens": 521318842.0, + "step": 20144 + }, + { + "epoch": 2.212277619152207, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3165900707244873, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7137603759765625, + "num_tokens": 521346698.0, + "step": 20145 + }, 
+ { + "epoch": 2.212387436854821, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5167040824890137, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7267782688140869, + "num_tokens": 521371578.0, + "step": 20146 + }, + { + "epoch": 2.2124972545574346, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5948357582092285, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7311092019081116, + "num_tokens": 521397721.0, + "step": 20147 + }, + { + "epoch": 2.2126070722600484, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.647083044052124, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7138199210166931, + "num_tokens": 521418959.0, + "step": 20148 + }, + { + "epoch": 2.212716889962662, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.448728561401367, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7085431814193726, + "num_tokens": 521447973.0, + "step": 20149 + }, + { + "epoch": 2.2128267076652754, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3397555351257324, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7126951217651367, + "num_tokens": 521475234.0, + "step": 20150 + }, + { + "epoch": 2.212936525367889, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.499244451522827, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7273557782173157, + "num_tokens": 521499190.0, + "step": 20151 + }, + { + "epoch": 2.213046343070503, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.627457857131958, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7162922620773315, + "num_tokens": 521525491.0, + "step": 20152 + }, + { + "epoch": 2.2131561607731167, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6114954948425293, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7083637714385986, + "num_tokens": 521548624.0, + "step": 20153 + }, + { + "epoch": 
2.2132659784757305, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2494945526123047, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6980115175247192, + "num_tokens": 521579307.0, + "step": 20154 + }, + { + "epoch": 2.213375796178344, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.548884868621826, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.73907470703125, + "num_tokens": 521602114.0, + "step": 20155 + }, + { + "epoch": 2.2134856138809575, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4980227947235107, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7366774082183838, + "num_tokens": 521625068.0, + "step": 20156 + }, + { + "epoch": 2.2135954315835713, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4341530799865723, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7253859639167786, + "num_tokens": 521649794.0, + "step": 20157 + }, + { + "epoch": 2.213705249286185, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8269848823547363, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7132415771484375, + "num_tokens": 521669433.0, + "step": 20158 + }, + { + "epoch": 2.213815066988799, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5011632442474365, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7101643085479736, + "num_tokens": 521691755.0, + "step": 20159 + }, + { + "epoch": 2.213924884691412, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.355578660964966, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7176516652107239, + "num_tokens": 521717300.0, + "step": 20160 + }, + { + "epoch": 2.214034702394026, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3063621520996094, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.68489670753479, + "num_tokens": 521746826.0, + "step": 20161 + }, + { + "epoch": 2.2141445200966396, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.656298875808716, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7268356084823608, + "num_tokens": 521767377.0, + "step": 20162 + }, + { + "epoch": 2.2142543377992534, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.404958486557007, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7184114456176758, + "num_tokens": 521792448.0, + "step": 20163 + }, + { + "epoch": 2.2143641555018667, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4104719161987305, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.718671441078186, + "num_tokens": 521817314.0, + "step": 20164 + }, + { + "epoch": 2.2144739732044805, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5412046909332275, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.7065286040306091, + "num_tokens": 521844677.0, + "step": 20165 + }, + { + "epoch": 2.2145837909070942, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7154717445373535, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7449674010276794, + "num_tokens": 521865563.0, + "step": 20166 + }, + { + "epoch": 2.214693608609708, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6635191440582275, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7270883321762085, + "num_tokens": 521885758.0, + "step": 20167 + }, + { + "epoch": 2.2148034263123217, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.621690034866333, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7478992938995361, + "num_tokens": 521907417.0, + "step": 20168 + }, + { + "epoch": 2.214913244014935, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.340378522872925, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.711174488067627, + "num_tokens": 521934415.0, + "step": 20169 + }, + { + "epoch": 2.215023061717549, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.794835090637207, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7074248194694519, + "num_tokens": 521956542.0, + "step": 20170 + }, + { + "epoch": 2.2151328794201626, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.359994411468506, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.733095645904541, + "num_tokens": 521981406.0, + "step": 20171 + }, + { + "epoch": 2.2152426971227763, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.410527467727661, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7119941711425781, + "num_tokens": 522006913.0, + "step": 20172 + }, + { + "epoch": 2.2153525148253896, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3866820335388184, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7050390243530273, + "num_tokens": 522032649.0, + "step": 20173 + }, + { + "epoch": 2.2154623325280034, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 32.3661003112793, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.721646785736084, + "num_tokens": 522054575.0, + "step": 20174 + }, + { + "epoch": 2.215572150230617, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1327290534973145, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6905947327613831, + "num_tokens": 522088974.0, + "step": 20175 + }, + { + "epoch": 2.215681967933231, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.516206741333008, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7122286558151245, + "num_tokens": 522112488.0, + "step": 20176 + }, + { + "epoch": 2.2157917856358447, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.685429334640503, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7407680749893188, + "num_tokens": 522133063.0, + "step": 20177 + }, + { + "epoch": 2.215901603338458, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3404955863952637, + 
"learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7050775289535522, + "num_tokens": 522161195.0, + "step": 20178 + }, + { + "epoch": 2.2160114210410717, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3733770847320557, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7183321118354797, + "num_tokens": 522188462.0, + "step": 20179 + }, + { + "epoch": 2.2161212387436855, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2753195762634277, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7006351947784424, + "num_tokens": 522218385.0, + "step": 20180 + }, + { + "epoch": 2.2162310564462993, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.155442476272583, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.723020613193512, + "num_tokens": 522250015.0, + "step": 20181 + }, + { + "epoch": 2.216340874148913, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3933048248291016, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7133538126945496, + "num_tokens": 522276403.0, + "step": 20182 + }, + { + "epoch": 2.2164506918515263, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.210770845413208, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6998571157455444, + "num_tokens": 522309096.0, + "step": 20183 + }, + { + "epoch": 2.21656050955414, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.330812692642212, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6950304508209229, + "num_tokens": 522337840.0, + "step": 20184 + }, + { + "epoch": 2.216670327256754, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4083027839660645, + "learning_rate": 1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7484925389289856, + "num_tokens": 522361322.0, + "step": 20185 + }, + { + "epoch": 2.2167801449593676, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5782175064086914, + "learning_rate": 1e-06, + 
"loss": 0.8941, + "mean_token_accuracy": 0.7357807159423828, + "num_tokens": 522382718.0, + "step": 20186 + }, + { + "epoch": 2.216889962661981, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4194281101226807, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7374535799026489, + "num_tokens": 522407048.0, + "step": 20187 + }, + { + "epoch": 2.2169997803645947, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4329283237457275, + "learning_rate": 1e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7570885419845581, + "num_tokens": 522432975.0, + "step": 20188 + }, + { + "epoch": 2.2171095980672084, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.407599925994873, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7197117805480957, + "num_tokens": 522460947.0, + "step": 20189 + }, + { + "epoch": 2.217219415769822, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1837353706359863, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7207458019256592, + "num_tokens": 522491659.0, + "step": 20190 + }, + { + "epoch": 2.217329233472436, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7130351066589355, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7212209701538086, + "num_tokens": 522513786.0, + "step": 20191 + }, + { + "epoch": 2.2174390511750492, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.384758472442627, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7234936356544495, + "num_tokens": 522540420.0, + "step": 20192 + }, + { + "epoch": 2.217548868877663, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7563767433166504, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7349662184715271, + "num_tokens": 522560474.0, + "step": 20193 + }, + { + "epoch": 2.2176586865802768, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5657546520233154, + "learning_rate": 1e-06, + "loss": 0.8987, + 
"mean_token_accuracy": 0.7281744480133057, + "num_tokens": 522583465.0, + "step": 20194 + }, + { + "epoch": 2.2177685042828905, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4139316082000732, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.718170166015625, + "num_tokens": 522609422.0, + "step": 20195 + }, + { + "epoch": 2.217878321985504, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4873080253601074, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7135758399963379, + "num_tokens": 522634318.0, + "step": 20196 + }, + { + "epoch": 2.2179881396881176, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5466041564941406, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7313523292541504, + "num_tokens": 522658245.0, + "step": 20197 + }, + { + "epoch": 2.2180979573907313, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.640080213546753, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7167688608169556, + "num_tokens": 522682109.0, + "step": 20198 + }, + { + "epoch": 2.218207775093345, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1872949600219727, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7119951248168945, + "num_tokens": 522713748.0, + "step": 20199 + }, + { + "epoch": 2.218317592795959, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.326467752456665, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6881768703460693, + "num_tokens": 522742450.0, + "step": 20200 + }, + { + "epoch": 2.218427410498572, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3000621795654297, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.709600567817688, + "num_tokens": 522771817.0, + "step": 20201 + }, + { + "epoch": 2.218537228201186, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2705917358398438, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 
0.6993907690048218, + "num_tokens": 522799701.0, + "step": 20202 + }, + { + "epoch": 2.2186470459037997, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.534074544906616, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.718762218952179, + "num_tokens": 522824493.0, + "step": 20203 + }, + { + "epoch": 2.2187568636064134, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7669079303741455, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7080250978469849, + "num_tokens": 522846289.0, + "step": 20204 + }, + { + "epoch": 2.218866681309027, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.740361452102661, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7325390577316284, + "num_tokens": 522867454.0, + "step": 20205 + }, + { + "epoch": 2.2189764990116405, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5231359004974365, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.716132640838623, + "num_tokens": 522889945.0, + "step": 20206 + }, + { + "epoch": 2.2190863167142543, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1763715744018555, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7200499773025513, + "num_tokens": 522923535.0, + "step": 20207 + }, + { + "epoch": 2.219196134416868, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.646592140197754, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7160676121711731, + "num_tokens": 522945158.0, + "step": 20208 + }, + { + "epoch": 2.219305952119482, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3284554481506348, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7164583802223206, + "num_tokens": 522975291.0, + "step": 20209 + }, + { + "epoch": 2.2194157698220955, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3998727798461914, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7199878096580505, + 
"num_tokens": 523002590.0, + "step": 20210 + }, + { + "epoch": 2.219525587524709, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.754929542541504, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7071094512939453, + "num_tokens": 523024099.0, + "step": 20211 + }, + { + "epoch": 2.2196354052273226, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6328957080841064, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7140244245529175, + "num_tokens": 523046434.0, + "step": 20212 + }, + { + "epoch": 2.2197452229299364, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.654655933380127, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7245479822158813, + "num_tokens": 523069020.0, + "step": 20213 + }, + { + "epoch": 2.21985504063255, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6690571308135986, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7234684228897095, + "num_tokens": 523092677.0, + "step": 20214 + }, + { + "epoch": 2.2199648583351634, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.305529832839966, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7082467675209045, + "num_tokens": 523125967.0, + "step": 20215 + }, + { + "epoch": 2.220074676037777, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.506295680999756, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7175841927528381, + "num_tokens": 523152192.0, + "step": 20216 + }, + { + "epoch": 2.220184493740391, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.634883403778076, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7108004689216614, + "num_tokens": 523174814.0, + "step": 20217 + }, + { + "epoch": 2.2202943114430047, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4884445667266846, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7064775824546814, + "num_tokens": 523199753.0, + 
"step": 20218 + }, + { + "epoch": 2.2204041291456185, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3834056854248047, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7061679363250732, + "num_tokens": 523228109.0, + "step": 20219 + }, + { + "epoch": 2.220513946848232, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3202366828918457, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.707996129989624, + "num_tokens": 523257988.0, + "step": 20220 + }, + { + "epoch": 2.2206237645508455, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8157882690429688, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7259615063667297, + "num_tokens": 523277505.0, + "step": 20221 + }, + { + "epoch": 2.2207335822534593, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.388584852218628, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7087869048118591, + "num_tokens": 523304940.0, + "step": 20222 + }, + { + "epoch": 2.220843399956073, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4280383586883545, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7023583650588989, + "num_tokens": 523332632.0, + "step": 20223 + }, + { + "epoch": 2.2209532176586864, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.590024709701538, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6892574429512024, + "num_tokens": 523361444.0, + "step": 20224 + }, + { + "epoch": 2.2210630353613, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7298412322998047, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7176433801651001, + "num_tokens": 523383465.0, + "step": 20225 + }, + { + "epoch": 2.221172853063914, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.547837018966675, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7345147728919983, + "num_tokens": 523406351.0, + "step": 20226 + }, + { + "epoch": 
2.2212826707665276, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5454728603363037, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7336476445198059, + "num_tokens": 523432018.0, + "step": 20227 + }, + { + "epoch": 2.2213924884691414, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2951653003692627, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6983135938644409, + "num_tokens": 523463555.0, + "step": 20228 + }, + { + "epoch": 2.2215023061717547, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.755722761154175, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7205209732055664, + "num_tokens": 523485185.0, + "step": 20229 + }, + { + "epoch": 2.2216121238743685, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5615720748901367, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.694329559803009, + "num_tokens": 523509089.0, + "step": 20230 + }, + { + "epoch": 2.221721941576982, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.483551025390625, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7192606329917908, + "num_tokens": 523534084.0, + "step": 20231 + }, + { + "epoch": 2.221831759279596, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.286439895629883, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7008577585220337, + "num_tokens": 523564309.0, + "step": 20232 + }, + { + "epoch": 2.2219415769822097, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.410477876663208, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7295814752578735, + "num_tokens": 523588865.0, + "step": 20233 + }, + { + "epoch": 2.222051394684823, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.297510862350464, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6923162937164307, + "num_tokens": 523620490.0, + "step": 20234 + }, + { + "epoch": 2.222161212387437, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.4793264865875244, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7241463661193848, + "num_tokens": 523645323.0, + "step": 20235 + }, + { + "epoch": 2.2222710300900506, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.594999313354492, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7043952941894531, + "num_tokens": 523668337.0, + "step": 20236 + }, + { + "epoch": 2.2223808477926643, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4653425216674805, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7222042679786682, + "num_tokens": 523692874.0, + "step": 20237 + }, + { + "epoch": 2.222490665495278, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6112871170043945, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7362490892410278, + "num_tokens": 523714459.0, + "step": 20238 + }, + { + "epoch": 2.2226004831978914, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7054154872894287, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7204462289810181, + "num_tokens": 523736494.0, + "step": 20239 + }, + { + "epoch": 2.222710300900505, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3956615924835205, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.704411506652832, + "num_tokens": 523766414.0, + "step": 20240 + }, + { + "epoch": 2.222820118603119, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4240646362304688, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.707135796546936, + "num_tokens": 523794278.0, + "step": 20241 + }, + { + "epoch": 2.2229299363057327, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.061558246612549, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7318840026855469, + "num_tokens": 523829092.0, + "step": 20242 + }, + { + "epoch": 2.223039754008346, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.9030544757843018, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7269238233566284, + "num_tokens": 523847786.0, + "step": 20243 + }, + { + "epoch": 2.2231495717109597, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5458192825317383, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6993215084075928, + "num_tokens": 523873224.0, + "step": 20244 + }, + { + "epoch": 2.2232593894135735, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.563185214996338, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7268549799919128, + "num_tokens": 523899560.0, + "step": 20245 + }, + { + "epoch": 2.2233692071161872, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2982747554779053, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7095189094543457, + "num_tokens": 523926135.0, + "step": 20246 + }, + { + "epoch": 2.223479024818801, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.864539861679077, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7221142649650574, + "num_tokens": 523947372.0, + "step": 20247 + }, + { + "epoch": 2.2235888425214143, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.528193950653076, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7119134664535522, + "num_tokens": 523972720.0, + "step": 20248 + }, + { + "epoch": 2.223698660224028, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.9108428955078125, + "learning_rate": 1e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7479949593544006, + "num_tokens": 523990408.0, + "step": 20249 + }, + { + "epoch": 2.223808477926642, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.383192300796509, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7197868227958679, + "num_tokens": 524018298.0, + "step": 20250 + }, + { + "epoch": 2.2239182956292556, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.272902488708496, 
+ "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7093828916549683, + "num_tokens": 524047897.0, + "step": 20251 + }, + { + "epoch": 2.224028113331869, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6294546127319336, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6941389441490173, + "num_tokens": 524072670.0, + "step": 20252 + }, + { + "epoch": 2.2241379310344827, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7281687259674072, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7359245419502258, + "num_tokens": 524094237.0, + "step": 20253 + }, + { + "epoch": 2.2242477487370964, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6863558292388916, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6977893114089966, + "num_tokens": 524119778.0, + "step": 20254 + }, + { + "epoch": 2.22435756643971, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3126370906829834, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.714537501335144, + "num_tokens": 524147937.0, + "step": 20255 + }, + { + "epoch": 2.224467384142324, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1223037242889404, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7175552845001221, + "num_tokens": 524182165.0, + "step": 20256 + }, + { + "epoch": 2.2245772018449372, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8044393062591553, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7390066385269165, + "num_tokens": 524201340.0, + "step": 20257 + }, + { + "epoch": 2.224687019547551, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5653133392333984, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7127468585968018, + "num_tokens": 524226267.0, + "step": 20258 + }, + { + "epoch": 2.2247968372501647, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4794678688049316, + "learning_rate": 1e-06, + 
"loss": 0.9244, + "mean_token_accuracy": 0.7274600267410278, + "num_tokens": 524249983.0, + "step": 20259 + }, + { + "epoch": 2.2249066549527785, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3998701572418213, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6937015056610107, + "num_tokens": 524277426.0, + "step": 20260 + }, + { + "epoch": 2.2250164726553923, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.462880849838257, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7032939791679382, + "num_tokens": 524302820.0, + "step": 20261 + }, + { + "epoch": 2.2251262903580056, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3879473209381104, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7146079540252686, + "num_tokens": 524331358.0, + "step": 20262 + }, + { + "epoch": 2.2252361080606193, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.347730875015259, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7110239863395691, + "num_tokens": 524358418.0, + "step": 20263 + }, + { + "epoch": 2.225345925763233, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6497292518615723, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7473034858703613, + "num_tokens": 524379393.0, + "step": 20264 + }, + { + "epoch": 2.225455743465847, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.282379388809204, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7272871732711792, + "num_tokens": 524406388.0, + "step": 20265 + }, + { + "epoch": 2.22556556116846, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3082900047302246, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6985809206962585, + "num_tokens": 524434241.0, + "step": 20266 + }, + { + "epoch": 2.225675378871074, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.269169569015503, + "learning_rate": 1e-06, + "loss": 1.0302, + 
"mean_token_accuracy": 0.7025917768478394, + "num_tokens": 524461854.0, + "step": 20267 + }, + { + "epoch": 2.2257851965736877, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.117239236831665, + "learning_rate": 1e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.68056720495224, + "num_tokens": 524496859.0, + "step": 20268 + }, + { + "epoch": 2.2258950142763014, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.532186985015869, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6999205946922302, + "num_tokens": 524523319.0, + "step": 20269 + }, + { + "epoch": 2.226004831978915, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.762700319290161, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7376110553741455, + "num_tokens": 524543772.0, + "step": 20270 + }, + { + "epoch": 2.2261146496815285, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.400867462158203, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7001543641090393, + "num_tokens": 524570526.0, + "step": 20271 + }, + { + "epoch": 2.2262244673841423, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.886072874069214, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7241151332855225, + "num_tokens": 524591893.0, + "step": 20272 + }, + { + "epoch": 2.226334285086756, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.506793260574341, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7179466485977173, + "num_tokens": 524617959.0, + "step": 20273 + }, + { + "epoch": 2.2264441027893698, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1845614910125732, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7254215478897095, + "num_tokens": 524650018.0, + "step": 20274 + }, + { + "epoch": 2.226553920491983, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3431503772735596, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 
0.7135661244392395, + "num_tokens": 524677642.0, + "step": 20275 + }, + { + "epoch": 2.226663738194597, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.632274866104126, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.713814914226532, + "num_tokens": 524700131.0, + "step": 20276 + }, + { + "epoch": 2.2267735558972106, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.597083568572998, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7315125465393066, + "num_tokens": 524723625.0, + "step": 20277 + }, + { + "epoch": 2.2268833735998244, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.385864734649658, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7074677348136902, + "num_tokens": 524749654.0, + "step": 20278 + }, + { + "epoch": 2.226993191302438, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6429460048675537, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7276726961135864, + "num_tokens": 524770999.0, + "step": 20279 + }, + { + "epoch": 2.2271030090050514, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3273966312408447, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7330226898193359, + "num_tokens": 524798476.0, + "step": 20280 + }, + { + "epoch": 2.227212826707665, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4036056995391846, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.7047656774520874, + "num_tokens": 524830257.0, + "step": 20281 + }, + { + "epoch": 2.227322644410279, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2220277786254883, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7224265933036804, + "num_tokens": 524860125.0, + "step": 20282 + }, + { + "epoch": 2.2274324621128927, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3144965171813965, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7166486978530884, + 
"num_tokens": 524886906.0, + "step": 20283 + }, + { + "epoch": 2.2275422798155065, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.389162063598633, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7143145203590393, + "num_tokens": 524913202.0, + "step": 20284 + }, + { + "epoch": 2.2276520975181198, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8775107860565186, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7368048429489136, + "num_tokens": 524934388.0, + "step": 20285 + }, + { + "epoch": 2.2277619152207335, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.766240119934082, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7271006107330322, + "num_tokens": 524954701.0, + "step": 20286 + }, + { + "epoch": 2.2278717329233473, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.329374313354492, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7275595664978027, + "num_tokens": 524982373.0, + "step": 20287 + }, + { + "epoch": 2.227981550625961, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.50742769241333, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7274026870727539, + "num_tokens": 525006604.0, + "step": 20288 + }, + { + "epoch": 2.228091368328575, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.609140157699585, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7306628823280334, + "num_tokens": 525030729.0, + "step": 20289 + }, + { + "epoch": 2.228201186031188, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.494823694229126, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7035065293312073, + "num_tokens": 525057137.0, + "step": 20290 + }, + { + "epoch": 2.228311003733802, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.574096202850342, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7123891115188599, + "num_tokens": 525083368.0, + 
"step": 20291 + }, + { + "epoch": 2.2284208214364156, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5147206783294678, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7114479541778564, + "num_tokens": 525106564.0, + "step": 20292 + }, + { + "epoch": 2.2285306391390294, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4226772785186768, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7204428315162659, + "num_tokens": 525131346.0, + "step": 20293 + }, + { + "epoch": 2.2286404568416427, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5756077766418457, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7113768458366394, + "num_tokens": 525153578.0, + "step": 20294 + }, + { + "epoch": 2.2287502745442564, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3231241703033447, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.720590353012085, + "num_tokens": 525180802.0, + "step": 20295 + }, + { + "epoch": 2.22886009224687, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.450723648071289, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6990957856178284, + "num_tokens": 525207531.0, + "step": 20296 + }, + { + "epoch": 2.228969909949484, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3052127361297607, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7266223430633545, + "num_tokens": 525235532.0, + "step": 20297 + }, + { + "epoch": 2.2290797276520977, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.566725492477417, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6959043741226196, + "num_tokens": 525260748.0, + "step": 20298 + }, + { + "epoch": 2.229189545354711, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3114304542541504, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7120518684387207, + "num_tokens": 525290637.0, + "step": 20299 + }, + { + 
"epoch": 2.229299363057325, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.736978769302368, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7136102914810181, + "num_tokens": 525311476.0, + "step": 20300 + }, + { + "epoch": 2.2294091807599385, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4839837551116943, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7233729362487793, + "num_tokens": 525336307.0, + "step": 20301 + }, + { + "epoch": 2.2295189984625523, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5672202110290527, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7142214775085449, + "num_tokens": 525360801.0, + "step": 20302 + }, + { + "epoch": 2.2296288161651656, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.885136604309082, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.711784839630127, + "num_tokens": 525381272.0, + "step": 20303 + }, + { + "epoch": 2.2297386338677794, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.564490556716919, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6966010332107544, + "num_tokens": 525405143.0, + "step": 20304 + }, + { + "epoch": 2.229848451570393, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.474698543548584, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7062398195266724, + "num_tokens": 525432417.0, + "step": 20305 + }, + { + "epoch": 2.229958269273007, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3326010704040527, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7109683752059937, + "num_tokens": 525458740.0, + "step": 20306 + }, + { + "epoch": 2.2300680869756206, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5069029331207275, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.713422417640686, + "num_tokens": 525483970.0, + "step": 20307 + }, + { + "epoch": 2.230177904678234, + 
"ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3980560302734375, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7168715596199036, + "num_tokens": 525511566.0, + "step": 20308 + }, + { + "epoch": 2.2302877223808477, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.523690938949585, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7358678579330444, + "num_tokens": 525534404.0, + "step": 20309 + }, + { + "epoch": 2.2303975400834615, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4973409175872803, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7096811532974243, + "num_tokens": 525559670.0, + "step": 20310 + }, + { + "epoch": 2.2305073577860752, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7412760257720947, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7284277677536011, + "num_tokens": 525582828.0, + "step": 20311 + }, + { + "epoch": 2.230617175488689, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.69889760017395, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7176512479782104, + "num_tokens": 525605437.0, + "step": 20312 + }, + { + "epoch": 2.2307269931913023, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.685354471206665, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7326292395591736, + "num_tokens": 525628750.0, + "step": 20313 + }, + { + "epoch": 2.230836810893916, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.475325584411621, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7180412411689758, + "num_tokens": 525652681.0, + "step": 20314 + }, + { + "epoch": 2.23094662859653, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.575014352798462, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7268886566162109, + "num_tokens": 525675705.0, + "step": 20315 + }, + { + "epoch": 2.2310564462991436, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.3298532962799072, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7206056118011475, + "num_tokens": 525702444.0, + "step": 20316 + }, + { + "epoch": 2.231166264001757, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.723717451095581, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7325748205184937, + "num_tokens": 525725438.0, + "step": 20317 + }, + { + "epoch": 2.2312760817043706, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2224175930023193, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7155759334564209, + "num_tokens": 525755652.0, + "step": 20318 + }, + { + "epoch": 2.2313858994069844, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4896674156188965, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7220010161399841, + "num_tokens": 525781826.0, + "step": 20319 + }, + { + "epoch": 2.231495717109598, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.108283519744873, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6922787427902222, + "num_tokens": 525820208.0, + "step": 20320 + }, + { + "epoch": 2.231605534812212, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5373876094818115, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7252229452133179, + "num_tokens": 525841822.0, + "step": 20321 + }, + { + "epoch": 2.2317153525148252, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1926968097686768, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6891070008277893, + "num_tokens": 525873699.0, + "step": 20322 + }, + { + "epoch": 2.231825170217439, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7559621334075928, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.7029078006744385, + "num_tokens": 525896504.0, + "step": 20323 + }, + { + "epoch": 2.2319349879200527, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.306044578552246, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7343958020210266, + "num_tokens": 525923052.0, + "step": 20324 + }, + { + "epoch": 2.2320448056226665, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4414377212524414, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7060809135437012, + "num_tokens": 525950202.0, + "step": 20325 + }, + { + "epoch": 2.23215462332528, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.124809741973877, + "learning_rate": 1e-06, + "loss": 1.0966, + "mean_token_accuracy": 0.6885201334953308, + "num_tokens": 525983447.0, + "step": 20326 + }, + { + "epoch": 2.2322644410278936, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.392418622970581, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6886433362960815, + "num_tokens": 526010645.0, + "step": 20327 + }, + { + "epoch": 2.2323742587305073, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.644223928451538, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7067054510116577, + "num_tokens": 526033189.0, + "step": 20328 + }, + { + "epoch": 2.232484076433121, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5706589221954346, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7072238922119141, + "num_tokens": 526057076.0, + "step": 20329 + }, + { + "epoch": 2.232593894135735, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.290415048599243, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6939862966537476, + "num_tokens": 526088194.0, + "step": 20330 + }, + { + "epoch": 2.232703711838348, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5667848587036133, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6940853595733643, + "num_tokens": 526112889.0, + "step": 20331 + }, + { + "epoch": 2.232813529540962, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3224544525146484, + 
"learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7174063920974731, + "num_tokens": 526140702.0, + "step": 20332 + }, + { + "epoch": 2.2329233472435757, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.184492349624634, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6802080273628235, + "num_tokens": 526172674.0, + "step": 20333 + }, + { + "epoch": 2.2330331649461894, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6892855167388916, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7332741022109985, + "num_tokens": 526194778.0, + "step": 20334 + }, + { + "epoch": 2.233142982648803, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.540191650390625, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7112194299697876, + "num_tokens": 526221856.0, + "step": 20335 + }, + { + "epoch": 2.2332528003514165, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5594515800476074, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.722730278968811, + "num_tokens": 526245796.0, + "step": 20336 + }, + { + "epoch": 2.2333626180540302, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.48726749420166, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7272793054580688, + "num_tokens": 526269914.0, + "step": 20337 + }, + { + "epoch": 2.233472435756644, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5728888511657715, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7481206059455872, + "num_tokens": 526292531.0, + "step": 20338 + }, + { + "epoch": 2.2335822534592578, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8034114837646484, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7254575490951538, + "num_tokens": 526311733.0, + "step": 20339 + }, + { + "epoch": 2.2336920711618715, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2840795516967773, + "learning_rate": 1e-06, + "loss": 
0.8646, + "mean_token_accuracy": 0.7405152916908264, + "num_tokens": 526338515.0, + "step": 20340 + }, + { + "epoch": 2.233801888864485, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2857460975646973, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.732125997543335, + "num_tokens": 526371474.0, + "step": 20341 + }, + { + "epoch": 2.2339117065670986, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.947026014328003, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7226740121841431, + "num_tokens": 526390129.0, + "step": 20342 + }, + { + "epoch": 2.2340215242697123, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.316906690597534, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7215873003005981, + "num_tokens": 526417402.0, + "step": 20343 + }, + { + "epoch": 2.234131341972326, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4259064197540283, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7425265312194824, + "num_tokens": 526440645.0, + "step": 20344 + }, + { + "epoch": 2.2342411596749394, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4019241333007812, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7441704869270325, + "num_tokens": 526466874.0, + "step": 20345 + }, + { + "epoch": 2.234350977377553, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5543293952941895, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7157309055328369, + "num_tokens": 526490122.0, + "step": 20346 + }, + { + "epoch": 2.234460795080167, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.25295352935791, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6969569325447083, + "num_tokens": 526517745.0, + "step": 20347 + }, + { + "epoch": 2.2345706127827807, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.615407705307007, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 
0.7308468818664551, + "num_tokens": 526542473.0, + "step": 20348 + }, + { + "epoch": 2.2346804304853944, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.474987030029297, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6892002820968628, + "num_tokens": 526567896.0, + "step": 20349 + }, + { + "epoch": 2.2347902481880078, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7729556560516357, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7325426340103149, + "num_tokens": 526588527.0, + "step": 20350 + }, + { + "epoch": 2.2349000658906215, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.635798931121826, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7342585325241089, + "num_tokens": 526610904.0, + "step": 20351 + }, + { + "epoch": 2.2350098835932353, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.299647331237793, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7205079793930054, + "num_tokens": 526638307.0, + "step": 20352 + }, + { + "epoch": 2.235119701295849, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7110135555267334, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7329004406929016, + "num_tokens": 526657833.0, + "step": 20353 + }, + { + "epoch": 2.2352295189984623, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2363100051879883, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.699248731136322, + "num_tokens": 526688820.0, + "step": 20354 + }, + { + "epoch": 2.235339336701076, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.662601947784424, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7167556881904602, + "num_tokens": 526712067.0, + "step": 20355 + }, + { + "epoch": 2.23544915440369, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.709287643432617, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7242282032966614, + 
"num_tokens": 526734605.0, + "step": 20356 + }, + { + "epoch": 2.2355589721063036, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.325850248336792, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7040266394615173, + "num_tokens": 526763488.0, + "step": 20357 + }, + { + "epoch": 2.2356687898089174, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4368813037872314, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7151070833206177, + "num_tokens": 526788452.0, + "step": 20358 + }, + { + "epoch": 2.2357786075115307, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4807684421539307, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7139687538146973, + "num_tokens": 526812108.0, + "step": 20359 + }, + { + "epoch": 2.2358884252141444, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.544619560241699, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7221803665161133, + "num_tokens": 526836705.0, + "step": 20360 + }, + { + "epoch": 2.235998242916758, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.422393321990967, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7353529930114746, + "num_tokens": 526860268.0, + "step": 20361 + }, + { + "epoch": 2.236108060619372, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3191943168640137, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7138299345970154, + "num_tokens": 526888612.0, + "step": 20362 + }, + { + "epoch": 2.2362178783219857, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1873793601989746, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.746422529220581, + "num_tokens": 526918780.0, + "step": 20363 + }, + { + "epoch": 2.236327696024599, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2534596920013428, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7268336415290833, + "num_tokens": 526947039.0, + 
"step": 20364 + }, + { + "epoch": 2.236437513727213, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.649433135986328, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7486979365348816, + "num_tokens": 526967088.0, + "step": 20365 + }, + { + "epoch": 2.2365473314298265, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7230377197265625, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7146040797233582, + "num_tokens": 526988400.0, + "step": 20366 + }, + { + "epoch": 2.2366571491324403, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5568222999572754, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7035051584243774, + "num_tokens": 527015572.0, + "step": 20367 + }, + { + "epoch": 2.2367669668350536, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.62648868560791, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7308324575424194, + "num_tokens": 527041790.0, + "step": 20368 + }, + { + "epoch": 2.2368767845376674, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.465439796447754, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7221603393554688, + "num_tokens": 527066764.0, + "step": 20369 + }, + { + "epoch": 2.236986602240281, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.526130199432373, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7176737189292908, + "num_tokens": 527092131.0, + "step": 20370 + }, + { + "epoch": 2.237096419942895, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.574911594390869, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7018855810165405, + "num_tokens": 527115857.0, + "step": 20371 + }, + { + "epoch": 2.2372062376455086, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.37572979927063, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7023234367370605, + "num_tokens": 527146876.0, + "step": 20372 + }, + { + "epoch": 
2.237316055348122, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7530620098114014, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7289210557937622, + "num_tokens": 527167614.0, + "step": 20373 + }, + { + "epoch": 2.2374258730507357, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.605195999145508, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7156606912612915, + "num_tokens": 527191952.0, + "step": 20374 + }, + { + "epoch": 2.2375356907533495, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.701107978820801, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7443389892578125, + "num_tokens": 527211500.0, + "step": 20375 + }, + { + "epoch": 2.237645508455963, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4698967933654785, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7249910831451416, + "num_tokens": 527236686.0, + "step": 20376 + }, + { + "epoch": 2.2377553261585765, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.571866512298584, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7260291576385498, + "num_tokens": 527260395.0, + "step": 20377 + }, + { + "epoch": 2.2378651438611903, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6668050289154053, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7147922515869141, + "num_tokens": 527282151.0, + "step": 20378 + }, + { + "epoch": 2.237974961563804, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.621649742126465, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7122901678085327, + "num_tokens": 527306096.0, + "step": 20379 + }, + { + "epoch": 2.238084779266418, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.998546600341797, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7147116661071777, + "num_tokens": 527325235.0, + "step": 20380 + }, + { + "epoch": 2.2381945969690316, + "ewc_loss": 
2.1457672119140625e-05, + "grad_norm": 2.602221727371216, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7348230481147766, + "num_tokens": 527347389.0, + "step": 20381 + }, + { + "epoch": 2.238304414671645, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5288150310516357, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6969826221466064, + "num_tokens": 527373511.0, + "step": 20382 + }, + { + "epoch": 2.2384142323742586, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1615183353424072, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7154369354248047, + "num_tokens": 527402166.0, + "step": 20383 + }, + { + "epoch": 2.2385240500768724, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7513911724090576, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7231711149215698, + "num_tokens": 527422959.0, + "step": 20384 + }, + { + "epoch": 2.238633867779486, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4076426029205322, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7074923515319824, + "num_tokens": 527450587.0, + "step": 20385 + }, + { + "epoch": 2.2387436854821, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.426790714263916, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7284329533576965, + "num_tokens": 527476126.0, + "step": 20386 + }, + { + "epoch": 2.238853503184713, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7554128170013428, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7153390645980835, + "num_tokens": 527496509.0, + "step": 20387 + }, + { + "epoch": 2.238963320887327, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.625701904296875, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7261356115341187, + "num_tokens": 527518868.0, + "step": 20388 + }, + { + "epoch": 2.2390731385899407, + "ewc_loss": 2.1457672119140625e-05, + 
"grad_norm": 2.39166522026062, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6981179118156433, + "num_tokens": 527547509.0, + "step": 20389 + }, + { + "epoch": 2.2391829562925545, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.201932430267334, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7116000652313232, + "num_tokens": 527579173.0, + "step": 20390 + }, + { + "epoch": 2.2392927739951682, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.797912836074829, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7381289005279541, + "num_tokens": 527599196.0, + "step": 20391 + }, + { + "epoch": 2.2394025916977816, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.154958724975586, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7226914763450623, + "num_tokens": 527629669.0, + "step": 20392 + }, + { + "epoch": 2.2395124094003953, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1876447200775146, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7156145572662354, + "num_tokens": 527660922.0, + "step": 20393 + }, + { + "epoch": 2.239622227103009, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.808499813079834, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7304307222366333, + "num_tokens": 527680675.0, + "step": 20394 + }, + { + "epoch": 2.239732044805623, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4190831184387207, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7298380732536316, + "num_tokens": 527703669.0, + "step": 20395 + }, + { + "epoch": 2.239841862508236, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.373507499694824, + "learning_rate": 1e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.6868393421173096, + "num_tokens": 527735082.0, + "step": 20396 + }, + { + "epoch": 2.23995168021085, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.421536445617676, + 
"learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7291857004165649, + "num_tokens": 527759437.0, + "step": 20397 + }, + { + "epoch": 2.2400614979134637, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.290900468826294, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6904100179672241, + "num_tokens": 527789906.0, + "step": 20398 + }, + { + "epoch": 2.2401713156160774, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5257999897003174, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.728896975517273, + "num_tokens": 527812118.0, + "step": 20399 + }, + { + "epoch": 2.240281133318691, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.560807704925537, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7049661874771118, + "num_tokens": 527837353.0, + "step": 20400 + }, + { + "epoch": 2.2403909510213045, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.142934560775757, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.69852614402771, + "num_tokens": 527868683.0, + "step": 20401 + }, + { + "epoch": 2.2405007687239182, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.531261444091797, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7269929647445679, + "num_tokens": 527892039.0, + "step": 20402 + }, + { + "epoch": 2.240610586426532, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.44852352142334, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7087832689285278, + "num_tokens": 527918095.0, + "step": 20403 + }, + { + "epoch": 2.2407204041291457, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7510030269622803, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7027117609977722, + "num_tokens": 527938217.0, + "step": 20404 + }, + { + "epoch": 2.240830221831759, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3576738834381104, + "learning_rate": 1e-06, + "loss": 
0.9785, + "mean_token_accuracy": 0.7080891728401184, + "num_tokens": 527965025.0, + "step": 20405 + }, + { + "epoch": 2.240940039534373, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5726447105407715, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6975637674331665, + "num_tokens": 527989612.0, + "step": 20406 + }, + { + "epoch": 2.2410498572369866, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7792775630950928, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7395471930503845, + "num_tokens": 528009851.0, + "step": 20407 + }, + { + "epoch": 2.2411596749396003, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1252591609954834, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6984527111053467, + "num_tokens": 528043087.0, + "step": 20408 + }, + { + "epoch": 2.241269492642214, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.352409601211548, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.69499272108078, + "num_tokens": 528072194.0, + "step": 20409 + }, + { + "epoch": 2.2413793103448274, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.423556089401245, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6987850069999695, + "num_tokens": 528098834.0, + "step": 20410 + }, + { + "epoch": 2.241489128047441, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5771121978759766, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7254876494407654, + "num_tokens": 528122187.0, + "step": 20411 + }, + { + "epoch": 2.241598945750055, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.167539596557617, + "learning_rate": 1e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6951675415039062, + "num_tokens": 528154044.0, + "step": 20412 + }, + { + "epoch": 2.2417087634526687, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5655105113983154, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 
0.7133588790893555, + "num_tokens": 528175938.0, + "step": 20413 + }, + { + "epoch": 2.2418185811552824, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.396979808807373, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6898068189620972, + "num_tokens": 528204258.0, + "step": 20414 + }, + { + "epoch": 2.2419283988578957, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2989614009857178, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7053559422492981, + "num_tokens": 528237244.0, + "step": 20415 + }, + { + "epoch": 2.2420382165605095, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4659368991851807, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.7019481658935547, + "num_tokens": 528264249.0, + "step": 20416 + }, + { + "epoch": 2.2421480342631233, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7860095500946045, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7199273109436035, + "num_tokens": 528284839.0, + "step": 20417 + }, + { + "epoch": 2.242257851965737, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5299856662750244, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7017571926116943, + "num_tokens": 528310191.0, + "step": 20418 + }, + { + "epoch": 2.2423676696683508, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3870785236358643, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7133793234825134, + "num_tokens": 528338473.0, + "step": 20419 + }, + { + "epoch": 2.242477487370964, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3816826343536377, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7100768685340881, + "num_tokens": 528368942.0, + "step": 20420 + }, + { + "epoch": 2.242587305073578, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.554203748703003, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.7024469375610352, + 
"num_tokens": 528394289.0, + "step": 20421 + }, + { + "epoch": 2.2426971227761916, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.473127841949463, + "learning_rate": 1e-06, + "loss": 1.1115, + "mean_token_accuracy": 0.6802012324333191, + "num_tokens": 528424723.0, + "step": 20422 + }, + { + "epoch": 2.2428069404788054, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4538819789886475, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6891015768051147, + "num_tokens": 528451787.0, + "step": 20423 + }, + { + "epoch": 2.2429167581814187, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2690823078155518, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6973254084587097, + "num_tokens": 528484557.0, + "step": 20424 + }, + { + "epoch": 2.2430265758840324, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.540567636489868, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7017812728881836, + "num_tokens": 528511132.0, + "step": 20425 + }, + { + "epoch": 2.243136393586646, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.389228343963623, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7138863205909729, + "num_tokens": 528539163.0, + "step": 20426 + }, + { + "epoch": 2.24324621128926, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.290051221847534, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7233066558837891, + "num_tokens": 528567797.0, + "step": 20427 + }, + { + "epoch": 2.2433560289918737, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3207991123199463, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.7003792524337769, + "num_tokens": 528595182.0, + "step": 20428 + }, + { + "epoch": 2.243465846694487, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.473013162612915, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7267258763313293, + "num_tokens": 528620815.0, + 
"step": 20429 + }, + { + "epoch": 2.2435756643971008, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.138998508453369, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.7019148468971252, + "num_tokens": 528653451.0, + "step": 20430 + }, + { + "epoch": 2.2436854820997145, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2311692237854004, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.704458475112915, + "num_tokens": 528682954.0, + "step": 20431 + }, + { + "epoch": 2.2437952998023283, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.360535144805908, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6962637305259705, + "num_tokens": 528713588.0, + "step": 20432 + }, + { + "epoch": 2.2439051175049416, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.365058660507202, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7291796207427979, + "num_tokens": 528738256.0, + "step": 20433 + }, + { + "epoch": 2.2440149352075554, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3837273120880127, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7169173955917358, + "num_tokens": 528764360.0, + "step": 20434 + }, + { + "epoch": 2.244124752910169, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.445321798324585, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.711340069770813, + "num_tokens": 528791968.0, + "step": 20435 + }, + { + "epoch": 2.244234570612783, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4040329456329346, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6994979381561279, + "num_tokens": 528818306.0, + "step": 20436 + }, + { + "epoch": 2.2443443883153966, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5081348419189453, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7273797988891602, + "num_tokens": 528844044.0, + "step": 20437 + }, + { + 
"epoch": 2.24445420601801, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.422804117202759, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7359530329704285, + "num_tokens": 528869218.0, + "step": 20438 + }, + { + "epoch": 2.2445640237206237, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.391711711883545, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7258650064468384, + "num_tokens": 528895694.0, + "step": 20439 + }, + { + "epoch": 2.2446738414232374, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5022668838500977, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6995177268981934, + "num_tokens": 528924203.0, + "step": 20440 + }, + { + "epoch": 2.244783659125851, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4985673427581787, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7172894477844238, + "num_tokens": 528948436.0, + "step": 20441 + }, + { + "epoch": 2.244893476828465, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 7.327902317047119, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7297718524932861, + "num_tokens": 528969233.0, + "step": 20442 + }, + { + "epoch": 2.2450032945310783, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4785478115081787, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7433754801750183, + "num_tokens": 528993458.0, + "step": 20443 + }, + { + "epoch": 2.245113112233692, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.551957845687866, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7198972702026367, + "num_tokens": 529019352.0, + "step": 20444 + }, + { + "epoch": 2.245222929936306, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5789713859558105, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.707091212272644, + "num_tokens": 529043241.0, + "step": 20445 + }, + { + "epoch": 2.2453327476389195, + 
"ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3856797218322754, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7040317058563232, + "num_tokens": 529071459.0, + "step": 20446 + }, + { + "epoch": 2.245442565341533, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4169209003448486, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7128382921218872, + "num_tokens": 529097966.0, + "step": 20447 + }, + { + "epoch": 2.2455523830441466, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4853854179382324, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.703801155090332, + "num_tokens": 529123413.0, + "step": 20448 + }, + { + "epoch": 2.2456622007467604, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4564731121063232, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6937853097915649, + "num_tokens": 529151037.0, + "step": 20449 + }, + { + "epoch": 2.245772018449374, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6269843578338623, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7167407274246216, + "num_tokens": 529173773.0, + "step": 20450 + }, + { + "epoch": 2.245881836151988, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.629868745803833, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7342604398727417, + "num_tokens": 529195203.0, + "step": 20451 + }, + { + "epoch": 2.245991653854601, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.522430419921875, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7320753335952759, + "num_tokens": 529218317.0, + "step": 20452 + }, + { + "epoch": 2.246101471557215, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4809348583221436, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7291969060897827, + "num_tokens": 529242615.0, + "step": 20453 + }, + { + "epoch": 2.2462112892598287, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.3873050212860107, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7132824659347534, + "num_tokens": 529268984.0, + "step": 20454 + }, + { + "epoch": 2.2463211069624425, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4463114738464355, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7221612334251404, + "num_tokens": 529293638.0, + "step": 20455 + }, + { + "epoch": 2.246430924665056, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.563121795654297, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7181919813156128, + "num_tokens": 529317290.0, + "step": 20456 + }, + { + "epoch": 2.2465407423676695, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8379034996032715, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7287893295288086, + "num_tokens": 529336431.0, + "step": 20457 + }, + { + "epoch": 2.2466505600702833, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.201504945755005, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6904996633529663, + "num_tokens": 529366266.0, + "step": 20458 + }, + { + "epoch": 2.246760377772897, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.848121166229248, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7334434390068054, + "num_tokens": 529383851.0, + "step": 20459 + }, + { + "epoch": 2.246870195475511, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5501797199249268, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7174994349479675, + "num_tokens": 529409177.0, + "step": 20460 + }, + { + "epoch": 2.246980013178124, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.277759313583374, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7128402590751648, + "num_tokens": 529439888.0, + "step": 20461 + }, + { + "epoch": 2.247089830880738, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.8128695487976074, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7350585460662842, + "num_tokens": 529462820.0, + "step": 20462 + }, + { + "epoch": 2.2471996485833516, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6583783626556396, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.718758225440979, + "num_tokens": 529486913.0, + "step": 20463 + }, + { + "epoch": 2.2473094662859654, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3656716346740723, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7235085964202881, + "num_tokens": 529513992.0, + "step": 20464 + }, + { + "epoch": 2.247419283988579, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4410455226898193, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7239307761192322, + "num_tokens": 529540114.0, + "step": 20465 + }, + { + "epoch": 2.2475291016911925, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2966127395629883, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.72054123878479, + "num_tokens": 529566298.0, + "step": 20466 + }, + { + "epoch": 2.2476389193938062, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.432429552078247, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7131339907646179, + "num_tokens": 529591700.0, + "step": 20467 + }, + { + "epoch": 2.24774873709642, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7291171550750732, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.753449559211731, + "num_tokens": 529612780.0, + "step": 20468 + }, + { + "epoch": 2.2478585547990337, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2964611053466797, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7018382549285889, + "num_tokens": 529641782.0, + "step": 20469 + }, + { + "epoch": 2.2479683725016475, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.396466016769409, 
+ "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7159324884414673, + "num_tokens": 529668437.0, + "step": 20470 + }, + { + "epoch": 2.248078190204261, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5603954792022705, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6998001337051392, + "num_tokens": 529692229.0, + "step": 20471 + }, + { + "epoch": 2.2481880079068746, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.169997215270996, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7194291353225708, + "num_tokens": 529722955.0, + "step": 20472 + }, + { + "epoch": 2.2482978256094883, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3819215297698975, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7221290469169617, + "num_tokens": 529750229.0, + "step": 20473 + }, + { + "epoch": 2.248407643312102, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3921666145324707, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.706078290939331, + "num_tokens": 529777962.0, + "step": 20474 + }, + { + "epoch": 2.2485174610147154, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3685200214385986, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7148890495300293, + "num_tokens": 529805968.0, + "step": 20475 + }, + { + "epoch": 2.248627278717329, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.566721200942993, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.736752450466156, + "num_tokens": 529828306.0, + "step": 20476 + }, + { + "epoch": 2.248737096419943, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4007210731506348, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.707229495048523, + "num_tokens": 529855477.0, + "step": 20477 + }, + { + "epoch": 2.2488469141225567, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5252525806427, + "learning_rate": 1e-06, + "loss": 
0.8922, + "mean_token_accuracy": 0.7333567142486572, + "num_tokens": 529879914.0, + "step": 20478 + }, + { + "epoch": 2.2489567318251704, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.53562593460083, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7237291932106018, + "num_tokens": 529902911.0, + "step": 20479 + }, + { + "epoch": 2.2490665495277837, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.130065679550171, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6927739381790161, + "num_tokens": 529936411.0, + "step": 20480 + }, + { + "epoch": 2.2491763672303975, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5800893306732178, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.718596875667572, + "num_tokens": 529957626.0, + "step": 20481 + }, + { + "epoch": 2.2492861849330112, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.635131597518921, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7105141282081604, + "num_tokens": 529981233.0, + "step": 20482 + }, + { + "epoch": 2.249396002635625, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.386639356613159, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7182171940803528, + "num_tokens": 530008794.0, + "step": 20483 + }, + { + "epoch": 2.2495058203382383, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3474152088165283, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7065039277076721, + "num_tokens": 530036435.0, + "step": 20484 + }, + { + "epoch": 2.249615638040852, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5022130012512207, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.720726490020752, + "num_tokens": 530060470.0, + "step": 20485 + }, + { + "epoch": 2.249725455743466, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2886834144592285, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 
0.7064791917800903, + "num_tokens": 530090529.0, + "step": 20486 + }, + { + "epoch": 2.2498352734460796, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3727357387542725, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7192932367324829, + "num_tokens": 530117324.0, + "step": 20487 + }, + { + "epoch": 2.2499450911486933, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.777608633041382, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7214343547821045, + "num_tokens": 530139881.0, + "step": 20488 + }, + { + "epoch": 2.2500549088513067, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7321877479553223, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7434803247451782, + "num_tokens": 530159955.0, + "step": 20489 + }, + { + "epoch": 2.2501647265539204, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.515458583831787, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7096917629241943, + "num_tokens": 530183815.0, + "step": 20490 + }, + { + "epoch": 2.250274544256534, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.468428373336792, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7161740064620972, + "num_tokens": 530209864.0, + "step": 20491 + }, + { + "epoch": 2.250384361959148, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5076491832733154, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7256362438201904, + "num_tokens": 530236326.0, + "step": 20492 + }, + { + "epoch": 2.2504941796617617, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6956822872161865, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7157776355743408, + "num_tokens": 530256498.0, + "step": 20493 + }, + { + "epoch": 2.250603997364375, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4856176376342773, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7139386534690857, + 
"num_tokens": 530280928.0, + "step": 20494 + }, + { + "epoch": 2.2507138150669888, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.644482135772705, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7177011966705322, + "num_tokens": 530301881.0, + "step": 20495 + }, + { + "epoch": 2.2508236327696025, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.255295991897583, + "learning_rate": 1e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.687577486038208, + "num_tokens": 530333726.0, + "step": 20496 + }, + { + "epoch": 2.2509334504722163, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4436872005462646, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7277287840843201, + "num_tokens": 530360414.0, + "step": 20497 + }, + { + "epoch": 2.25104326817483, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.337892532348633, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.697990894317627, + "num_tokens": 530387789.0, + "step": 20498 + }, + { + "epoch": 2.2511530858774433, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.578342914581299, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7138549089431763, + "num_tokens": 530410661.0, + "step": 20499 + }, + { + "epoch": 2.251262903580057, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.412061929702759, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7101317644119263, + "num_tokens": 530438675.0, + "step": 20500 + }, + { + "epoch": 2.251372721282671, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4077508449554443, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7151798009872437, + "num_tokens": 530466771.0, + "step": 20501 + }, + { + "epoch": 2.2514825389852846, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.544603109359741, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7485565543174744, + "num_tokens": 530489353.0, + "step": 
20502 + }, + { + "epoch": 2.251592356687898, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6062610149383545, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.711150586605072, + "num_tokens": 530514382.0, + "step": 20503 + }, + { + "epoch": 2.2517021743905117, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4968619346618652, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7057559490203857, + "num_tokens": 530539874.0, + "step": 20504 + }, + { + "epoch": 2.2518119920931254, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6173133850097656, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.720697283744812, + "num_tokens": 530564468.0, + "step": 20505 + }, + { + "epoch": 2.251921809795739, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.265435218811035, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7287804484367371, + "num_tokens": 530592630.0, + "step": 20506 + }, + { + "epoch": 2.2520316274983525, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3653154373168945, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7054957151412964, + "num_tokens": 530621981.0, + "step": 20507 + }, + { + "epoch": 2.2521414452009663, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.613687753677368, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7196506857872009, + "num_tokens": 530646013.0, + "step": 20508 + }, + { + "epoch": 2.25225126290358, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.727497100830078, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7290288209915161, + "num_tokens": 530669801.0, + "step": 20509 + }, + { + "epoch": 2.2523610806061938, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4263951778411865, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7498653531074524, + "num_tokens": 530695794.0, + "step": 20510 + }, + { + "epoch": 
2.2524708983088075, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2672243118286133, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7126930952072144, + "num_tokens": 530725142.0, + "step": 20511 + }, + { + "epoch": 2.252580716011421, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.231429100036621, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7244679927825928, + "num_tokens": 530754745.0, + "step": 20512 + }, + { + "epoch": 2.2526905337140346, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6826364994049072, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7137554287910461, + "num_tokens": 530778615.0, + "step": 20513 + }, + { + "epoch": 2.2528003514166484, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3549411296844482, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.724260687828064, + "num_tokens": 530806303.0, + "step": 20514 + }, + { + "epoch": 2.252910169119262, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.20884370803833, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7159411907196045, + "num_tokens": 530836596.0, + "step": 20515 + }, + { + "epoch": 2.253019986821876, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3853936195373535, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7114969491958618, + "num_tokens": 530862211.0, + "step": 20516 + }, + { + "epoch": 2.253129804524489, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5753633975982666, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6941065788269043, + "num_tokens": 530892342.0, + "step": 20517 + }, + { + "epoch": 2.253239622227103, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2869584560394287, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6875369548797607, + "num_tokens": 530924070.0, + "step": 20518 + }, + { + "epoch": 2.2533494399297167, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.373178005218506, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6891839504241943, + "num_tokens": 530953116.0, + "step": 20519 + }, + { + "epoch": 2.2534592576323305, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.285634994506836, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7160501480102539, + "num_tokens": 530981887.0, + "step": 20520 + }, + { + "epoch": 2.253569075334944, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.324728488922119, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.714897871017456, + "num_tokens": 531009953.0, + "step": 20521 + }, + { + "epoch": 2.2536788930375575, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1879465579986572, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7138189077377319, + "num_tokens": 531042200.0, + "step": 20522 + }, + { + "epoch": 2.2537887107401713, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2840793132781982, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7367812395095825, + "num_tokens": 531071165.0, + "step": 20523 + }, + { + "epoch": 2.253898528442785, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.501770257949829, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7278274297714233, + "num_tokens": 531095433.0, + "step": 20524 + }, + { + "epoch": 2.254008346145399, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5038044452667236, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7118042707443237, + "num_tokens": 531120481.0, + "step": 20525 + }, + { + "epoch": 2.254118163848012, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7549755573272705, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7269348502159119, + "num_tokens": 531143677.0, + "step": 20526 + }, + { + "epoch": 2.254227981550626, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.3222429752349854, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7017030715942383, + "num_tokens": 531173405.0, + "step": 20527 + }, + { + "epoch": 2.2543377992532396, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.533453941345215, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6912915706634521, + "num_tokens": 531200292.0, + "step": 20528 + }, + { + "epoch": 2.2544476169558534, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.563171625137329, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.719136655330658, + "num_tokens": 531223427.0, + "step": 20529 + }, + { + "epoch": 2.2545574346584667, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.486389398574829, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7008058428764343, + "num_tokens": 531248597.0, + "step": 20530 + }, + { + "epoch": 2.2546672523610805, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.684586763381958, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7286121845245361, + "num_tokens": 531269512.0, + "step": 20531 + }, + { + "epoch": 2.254777070063694, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.280208110809326, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.7051229476928711, + "num_tokens": 531298178.0, + "step": 20532 + }, + { + "epoch": 2.254886887766308, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.353391408920288, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7219940423965454, + "num_tokens": 531325600.0, + "step": 20533 + }, + { + "epoch": 2.2549967054689217, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.404291868209839, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7116473913192749, + "num_tokens": 531351729.0, + "step": 20534 + }, + { + "epoch": 2.255106523171535, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5828428268432617, + 
"learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7175973057746887, + "num_tokens": 531376139.0, + "step": 20535 + }, + { + "epoch": 2.255216340874149, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.52992582321167, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.706295371055603, + "num_tokens": 531402112.0, + "step": 20536 + }, + { + "epoch": 2.2553261585767626, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2908644676208496, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7056605815887451, + "num_tokens": 531431035.0, + "step": 20537 + }, + { + "epoch": 2.2554359762793763, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.289989471435547, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7100116014480591, + "num_tokens": 531459968.0, + "step": 20538 + }, + { + "epoch": 2.25554579398199, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.656628370285034, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7018427848815918, + "num_tokens": 531482442.0, + "step": 20539 + }, + { + "epoch": 2.2556556116846034, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.52648663520813, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7288709282875061, + "num_tokens": 531506980.0, + "step": 20540 + }, + { + "epoch": 2.255765429387217, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3542346954345703, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7208583354949951, + "num_tokens": 531536863.0, + "step": 20541 + }, + { + "epoch": 2.255875247089831, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4819982051849365, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7227562665939331, + "num_tokens": 531561662.0, + "step": 20542 + }, + { + "epoch": 2.2559850647924446, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6316733360290527, + "learning_rate": 1e-06, + "loss": 
0.8855, + "mean_token_accuracy": 0.7342419028282166, + "num_tokens": 531583732.0, + "step": 20543 + }, + { + "epoch": 2.2560948824950584, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.305074691772461, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7414470911026001, + "num_tokens": 531611244.0, + "step": 20544 + }, + { + "epoch": 2.2562047001976717, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8904545307159424, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7475448846817017, + "num_tokens": 531629118.0, + "step": 20545 + }, + { + "epoch": 2.2563145179002855, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3122618198394775, + "learning_rate": 1e-06, + "loss": 1.0988, + "mean_token_accuracy": 0.6797402501106262, + "num_tokens": 531660232.0, + "step": 20546 + }, + { + "epoch": 2.2564243356028992, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3234012126922607, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.721208393573761, + "num_tokens": 531691246.0, + "step": 20547 + }, + { + "epoch": 2.256534153305513, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1399903297424316, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6937581300735474, + "num_tokens": 531727836.0, + "step": 20548 + }, + { + "epoch": 2.2566439710081267, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.845362424850464, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7257713079452515, + "num_tokens": 531748360.0, + "step": 20549 + }, + { + "epoch": 2.25675378871074, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.224686861038208, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.6810986995697021, + "num_tokens": 531782369.0, + "step": 20550 + }, + { + "epoch": 2.256863606413354, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.8893532752990723, + "learning_rate": 1e-06, + "loss": 0.9946, + 
"mean_token_accuracy": 0.7011109590530396, + "num_tokens": 531803350.0, + "step": 20551 + }, + { + "epoch": 2.2569734241159676, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.446850538253784, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7129982113838196, + "num_tokens": 531830245.0, + "step": 20552 + }, + { + "epoch": 2.2570832418185813, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.279114246368408, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7259359955787659, + "num_tokens": 531857075.0, + "step": 20553 + }, + { + "epoch": 2.2571930595211946, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.184199094772339, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7049756050109863, + "num_tokens": 531887055.0, + "step": 20554 + }, + { + "epoch": 2.2573028772238084, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3918943405151367, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7046468257904053, + "num_tokens": 531914390.0, + "step": 20555 + }, + { + "epoch": 2.257412694926422, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.286421060562134, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7155994176864624, + "num_tokens": 531944376.0, + "step": 20556 + }, + { + "epoch": 2.257522512629036, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4028570652008057, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7165656089782715, + "num_tokens": 531970531.0, + "step": 20557 + }, + { + "epoch": 2.2576323303316492, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4114134311676025, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7228319644927979, + "num_tokens": 531997450.0, + "step": 20558 + }, + { + "epoch": 2.257742148034263, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3214240074157715, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 
0.7232619524002075, + "num_tokens": 532023100.0, + "step": 20559 + }, + { + "epoch": 2.2578519657368767, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3519463539123535, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7132998704910278, + "num_tokens": 532049239.0, + "step": 20560 + }, + { + "epoch": 2.2579617834394905, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.105541706085205, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7136092185974121, + "num_tokens": 532083272.0, + "step": 20561 + }, + { + "epoch": 2.2580716011421043, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3135738372802734, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7130427360534668, + "num_tokens": 532112550.0, + "step": 20562 + }, + { + "epoch": 2.2581814188447176, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2400920391082764, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7125072479248047, + "num_tokens": 532141991.0, + "step": 20563 + }, + { + "epoch": 2.2582912365473313, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.333911657333374, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7236674427986145, + "num_tokens": 532169669.0, + "step": 20564 + }, + { + "epoch": 2.258401054249945, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6515161991119385, + "learning_rate": 1e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7697252035140991, + "num_tokens": 532189423.0, + "step": 20565 + }, + { + "epoch": 2.258510871952559, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.527958869934082, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7158105373382568, + "num_tokens": 532213736.0, + "step": 20566 + }, + { + "epoch": 2.2586206896551726, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.229689598083496, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6936055421829224, + 
"num_tokens": 532242845.0, + "step": 20567 + }, + { + "epoch": 2.258730507357786, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3128461837768555, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7303565144538879, + "num_tokens": 532269830.0, + "step": 20568 + }, + { + "epoch": 2.2588403250603997, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4575273990631104, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.729627251625061, + "num_tokens": 532294985.0, + "step": 20569 + }, + { + "epoch": 2.2589501427630134, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3910696506500244, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7311300039291382, + "num_tokens": 532322489.0, + "step": 20570 + }, + { + "epoch": 2.259059960465627, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.568530559539795, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.712005078792572, + "num_tokens": 532346418.0, + "step": 20571 + }, + { + "epoch": 2.259169778168241, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.583850622177124, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7332168817520142, + "num_tokens": 532368992.0, + "step": 20572 + }, + { + "epoch": 2.2592795958708543, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5965335369110107, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7017012238502502, + "num_tokens": 532394273.0, + "step": 20573 + }, + { + "epoch": 2.259389413573468, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.713027000427246, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7094403505325317, + "num_tokens": 532416996.0, + "step": 20574 + }, + { + "epoch": 2.2594992312760818, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5483906269073486, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.718102216720581, + "num_tokens": 532442497.0, + 
"step": 20575 + }, + { + "epoch": 2.2596090489786955, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.490513324737549, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7056388854980469, + "num_tokens": 532469051.0, + "step": 20576 + }, + { + "epoch": 2.259718866681309, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.651473045349121, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7135706543922424, + "num_tokens": 532495620.0, + "step": 20577 + }, + { + "epoch": 2.2598286843839226, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.660334587097168, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7242236733436584, + "num_tokens": 532518777.0, + "step": 20578 + }, + { + "epoch": 2.2599385020865363, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.647183895111084, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7275469303131104, + "num_tokens": 532543082.0, + "step": 20579 + }, + { + "epoch": 2.26004831978915, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2411367893218994, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7093092203140259, + "num_tokens": 532571941.0, + "step": 20580 + }, + { + "epoch": 2.260158137491764, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3790645599365234, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7197628021240234, + "num_tokens": 532596864.0, + "step": 20581 + }, + { + "epoch": 2.260267955194377, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3003313541412354, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.7021158337593079, + "num_tokens": 532624412.0, + "step": 20582 + }, + { + "epoch": 2.260377772896991, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5850391387939453, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7369228005409241, + "num_tokens": 532645362.0, + "step": 20583 + }, + { + "epoch": 
2.2604875905996047, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7236478328704834, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.718830943107605, + "num_tokens": 532665259.0, + "step": 20584 + }, + { + "epoch": 2.2605974083022184, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.667750597000122, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7026791572570801, + "num_tokens": 532688492.0, + "step": 20585 + }, + { + "epoch": 2.2607072260048318, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6894757747650146, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7257119417190552, + "num_tokens": 532712808.0, + "step": 20586 + }, + { + "epoch": 2.2608170437074455, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3539795875549316, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7263896465301514, + "num_tokens": 532741960.0, + "step": 20587 + }, + { + "epoch": 2.2609268614100593, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1092684268951416, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7093254327774048, + "num_tokens": 532775010.0, + "step": 20588 + }, + { + "epoch": 2.261036679112673, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.508051633834839, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7136080265045166, + "num_tokens": 532799610.0, + "step": 20589 + }, + { + "epoch": 2.261146496815287, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.511380910873413, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7226053476333618, + "num_tokens": 532822318.0, + "step": 20590 + }, + { + "epoch": 2.2612563145179, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3669724464416504, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7258599996566772, + "num_tokens": 532848644.0, + "step": 20591 + }, + { + "epoch": 2.261366132220514, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.5421836376190186, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7040019035339355, + "num_tokens": 532874857.0, + "step": 20592 + }, + { + "epoch": 2.2614759499231276, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4799721240997314, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.71793532371521, + "num_tokens": 532901140.0, + "step": 20593 + }, + { + "epoch": 2.2615857676257414, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.283149242401123, + "learning_rate": 1e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.6821286678314209, + "num_tokens": 532933633.0, + "step": 20594 + }, + { + "epoch": 2.261695585328355, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3594164848327637, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7042149901390076, + "num_tokens": 532960356.0, + "step": 20595 + }, + { + "epoch": 2.2618054030309684, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7266042232513428, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7047219276428223, + "num_tokens": 532982102.0, + "step": 20596 + }, + { + "epoch": 2.261915220733582, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4362781047821045, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7250715494155884, + "num_tokens": 533008657.0, + "step": 20597 + }, + { + "epoch": 2.262025038436196, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2435381412506104, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.7000547647476196, + "num_tokens": 533040050.0, + "step": 20598 + }, + { + "epoch": 2.2621348561388097, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6732969284057617, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6983692646026611, + "num_tokens": 533065654.0, + "step": 20599 + }, + { + "epoch": 2.2622446738414235, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.5459258556365967, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.730448842048645, + "num_tokens": 533089382.0, + "step": 20600 + }, + { + "epoch": 2.262354491544037, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.312661647796631, + "learning_rate": 1e-06, + "loss": 1.1798, + "mean_token_accuracy": 0.6824021339416504, + "num_tokens": 533119669.0, + "step": 20601 + }, + { + "epoch": 2.2624643092466505, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.775334358215332, + "learning_rate": 1e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7600386142730713, + "num_tokens": 533138406.0, + "step": 20602 + }, + { + "epoch": 2.2625741269492643, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3624112606048584, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7234123349189758, + "num_tokens": 533165108.0, + "step": 20603 + }, + { + "epoch": 2.262683944651878, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2103829383850098, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7147102355957031, + "num_tokens": 533196382.0, + "step": 20604 + }, + { + "epoch": 2.2627937623544914, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5428950786590576, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7176330089569092, + "num_tokens": 533221981.0, + "step": 20605 + }, + { + "epoch": 2.262903580057105, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6434032917022705, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6998265981674194, + "num_tokens": 533248532.0, + "step": 20606 + }, + { + "epoch": 2.263013397759719, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2315094470977783, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7074176073074341, + "num_tokens": 533276848.0, + "step": 20607 + }, + { + "epoch": 2.2631232154623326, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 
2.243936061859131, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6898502111434937, + "num_tokens": 533309991.0, + "step": 20608 + }, + { + "epoch": 2.263233033164946, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.278452157974243, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.728848934173584, + "num_tokens": 533337717.0, + "step": 20609 + }, + { + "epoch": 2.2633428508675597, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.454512119293213, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7113376259803772, + "num_tokens": 533361471.0, + "step": 20610 + }, + { + "epoch": 2.2634526685701735, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.289076566696167, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7206066846847534, + "num_tokens": 533390149.0, + "step": 20611 + }, + { + "epoch": 2.263562486272787, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6204276084899902, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7475422620773315, + "num_tokens": 533412367.0, + "step": 20612 + }, + { + "epoch": 2.263672303975401, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5011298656463623, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7191447019577026, + "num_tokens": 533439118.0, + "step": 20613 + }, + { + "epoch": 2.2637821216780143, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.615513324737549, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.701252818107605, + "num_tokens": 533465411.0, + "step": 20614 + }, + { + "epoch": 2.263891939380628, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6320464611053467, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7106947898864746, + "num_tokens": 533491447.0, + "step": 20615 + }, + { + "epoch": 2.264001757083242, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7249531745910645, + 
"learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7283328771591187, + "num_tokens": 533513943.0, + "step": 20616 + }, + { + "epoch": 2.2641115747858556, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4704830646514893, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6967589855194092, + "num_tokens": 533541238.0, + "step": 20617 + }, + { + "epoch": 2.2642213924884693, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.390611171722412, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7275624871253967, + "num_tokens": 533565011.0, + "step": 20618 + }, + { + "epoch": 2.2643312101910826, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.0870187282562256, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7265684604644775, + "num_tokens": 533598006.0, + "step": 20619 + }, + { + "epoch": 2.2644410278936964, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.549839973449707, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6925234794616699, + "num_tokens": 533623268.0, + "step": 20620 + }, + { + "epoch": 2.26455084559631, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2341091632843018, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7167655825614929, + "num_tokens": 533652105.0, + "step": 20621 + }, + { + "epoch": 2.264660663298924, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.9366507530212402, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7206982374191284, + "num_tokens": 533670980.0, + "step": 20622 + }, + { + "epoch": 2.2647704810015377, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.550527334213257, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7083829641342163, + "num_tokens": 533697461.0, + "step": 20623 + }, + { + "epoch": 2.264880298704151, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7196645736694336, + "learning_rate": 1e-06, + 
"loss": 0.9988, + "mean_token_accuracy": 0.7117425203323364, + "num_tokens": 533723770.0, + "step": 20624 + }, + { + "epoch": 2.2649901164067647, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.190748691558838, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7250874638557434, + "num_tokens": 533752403.0, + "step": 20625 + }, + { + "epoch": 2.2650999341093785, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5303218364715576, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7225643396377563, + "num_tokens": 533777072.0, + "step": 20626 + }, + { + "epoch": 2.2652097518119922, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6470022201538086, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7286663055419922, + "num_tokens": 533800909.0, + "step": 20627 + }, + { + "epoch": 2.265319569514606, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5006775856018066, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7388326525688171, + "num_tokens": 533825942.0, + "step": 20628 + }, + { + "epoch": 2.2654293872172193, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4129810333251953, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7321501970291138, + "num_tokens": 533851246.0, + "step": 20629 + }, + { + "epoch": 2.265539204919833, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.767841100692749, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7328242659568787, + "num_tokens": 533871288.0, + "step": 20630 + }, + { + "epoch": 2.265649022622447, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.443016767501831, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7032623887062073, + "num_tokens": 533897718.0, + "step": 20631 + }, + { + "epoch": 2.2657588403250606, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.716660261154175, + "learning_rate": 1e-06, + "loss": 0.9426, + 
"mean_token_accuracy": 0.7237242460250854, + "num_tokens": 533919924.0, + "step": 20632 + }, + { + "epoch": 2.265868658027674, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4562182426452637, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7070444822311401, + "num_tokens": 533945249.0, + "step": 20633 + }, + { + "epoch": 2.2659784757302877, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4210963249206543, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6940090656280518, + "num_tokens": 533973516.0, + "step": 20634 + }, + { + "epoch": 2.2660882934329014, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5594396591186523, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7305443286895752, + "num_tokens": 533997573.0, + "step": 20635 + }, + { + "epoch": 2.266198111135515, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.884302854537964, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7420170903205872, + "num_tokens": 534016343.0, + "step": 20636 + }, + { + "epoch": 2.2663079288381285, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3218812942504883, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7082597017288208, + "num_tokens": 534046554.0, + "step": 20637 + }, + { + "epoch": 2.2664177465407422, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.60233736038208, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7226959466934204, + "num_tokens": 534070506.0, + "step": 20638 + }, + { + "epoch": 2.266527564243356, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3673298358917236, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7000790238380432, + "num_tokens": 534098458.0, + "step": 20639 + }, + { + "epoch": 2.2666373819459698, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.741807460784912, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 
0.7309942245483398, + "num_tokens": 534119676.0, + "step": 20640 + }, + { + "epoch": 2.2667471996485835, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.402902841567993, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7384793162345886, + "num_tokens": 534147125.0, + "step": 20641 + }, + { + "epoch": 2.266857017351197, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4722907543182373, + "learning_rate": 1e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6800680160522461, + "num_tokens": 534175831.0, + "step": 20642 + }, + { + "epoch": 2.2669668350538106, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5221588611602783, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7314900755882263, + "num_tokens": 534202981.0, + "step": 20643 + }, + { + "epoch": 2.2670766527564243, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4834389686584473, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6999431848526001, + "num_tokens": 534232202.0, + "step": 20644 + }, + { + "epoch": 2.267186470459038, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4839601516723633, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7346863150596619, + "num_tokens": 534256887.0, + "step": 20645 + }, + { + "epoch": 2.267296288161652, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.310398817062378, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7304298281669617, + "num_tokens": 534283370.0, + "step": 20646 + }, + { + "epoch": 2.267406105864265, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.476851463317871, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7349380254745483, + "num_tokens": 534305855.0, + "step": 20647 + }, + { + "epoch": 2.267515923566879, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.93013858795166, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7149872183799744, + 
"num_tokens": 534324343.0, + "step": 20648 + }, + { + "epoch": 2.2676257412694927, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.675384521484375, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7277325391769409, + "num_tokens": 534349733.0, + "step": 20649 + }, + { + "epoch": 2.2677355589721064, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5942275524139404, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7055346965789795, + "num_tokens": 534376612.0, + "step": 20650 + }, + { + "epoch": 2.26784537667472, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3469202518463135, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7188922762870789, + "num_tokens": 534405067.0, + "step": 20651 + }, + { + "epoch": 2.2679551943773335, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.282790422439575, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6881853938102722, + "num_tokens": 534434465.0, + "step": 20652 + }, + { + "epoch": 2.2680650120799473, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5974056720733643, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7071149945259094, + "num_tokens": 534457313.0, + "step": 20653 + }, + { + "epoch": 2.268174829782561, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.553713798522949, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7274283170700073, + "num_tokens": 534483692.0, + "step": 20654 + }, + { + "epoch": 2.2682846474851748, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.671231508255005, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7112385034561157, + "num_tokens": 534506119.0, + "step": 20655 + }, + { + "epoch": 2.268394465187788, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2495973110198975, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6919212341308594, + "num_tokens": 534539490.0, + 
"step": 20656 + }, + { + "epoch": 2.268504282890402, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3793978691101074, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7426131963729858, + "num_tokens": 534566135.0, + "step": 20657 + }, + { + "epoch": 2.2686141005930156, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5169146060943604, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7293647527694702, + "num_tokens": 534591151.0, + "step": 20658 + }, + { + "epoch": 2.2687239182956294, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.234172821044922, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7117221355438232, + "num_tokens": 534620416.0, + "step": 20659 + }, + { + "epoch": 2.2688337359982427, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.607842206954956, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7251213192939758, + "num_tokens": 534643272.0, + "step": 20660 + }, + { + "epoch": 2.2689435537008564, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2719593048095703, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.699782133102417, + "num_tokens": 534674244.0, + "step": 20661 + }, + { + "epoch": 2.26905337140347, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.203003168106079, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7300592660903931, + "num_tokens": 534702621.0, + "step": 20662 + }, + { + "epoch": 2.269163189106084, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.667139768600464, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7209527492523193, + "num_tokens": 534724554.0, + "step": 20663 + }, + { + "epoch": 2.2692730068086977, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2850842475891113, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.7002825736999512, + "num_tokens": 534754776.0, + "step": 20664 + }, + { + 
"epoch": 2.269382824511311, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.413475275039673, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7160025835037231, + "num_tokens": 534781638.0, + "step": 20665 + }, + { + "epoch": 2.2694926422139248, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7516002655029297, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7320973873138428, + "num_tokens": 534800711.0, + "step": 20666 + }, + { + "epoch": 2.2696024599165385, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.216310739517212, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6889004707336426, + "num_tokens": 534833402.0, + "step": 20667 + }, + { + "epoch": 2.2697122776191523, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.428513526916504, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7304016947746277, + "num_tokens": 534858057.0, + "step": 20668 + }, + { + "epoch": 2.269822095321766, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3500449657440186, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.693778395652771, + "num_tokens": 534884852.0, + "step": 20669 + }, + { + "epoch": 2.2699319130243794, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6680734157562256, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.721057653427124, + "num_tokens": 534906785.0, + "step": 20670 + }, + { + "epoch": 2.270041730726993, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.182633876800537, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7127854824066162, + "num_tokens": 534939871.0, + "step": 20671 + }, + { + "epoch": 2.270151548429607, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.912684679031372, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7404309511184692, + "num_tokens": 534957889.0, + "step": 20672 + }, + { + "epoch": 2.2702613661322206, + 
"ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.27311635017395, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6991176009178162, + "num_tokens": 534988918.0, + "step": 20673 + }, + { + "epoch": 2.2703711838348344, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5123822689056396, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7030400037765503, + "num_tokens": 535016145.0, + "step": 20674 + }, + { + "epoch": 2.2704810015374477, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.718273162841797, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7302554845809937, + "num_tokens": 535040159.0, + "step": 20675 + }, + { + "epoch": 2.2705908192400615, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2532548904418945, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7330023646354675, + "num_tokens": 535068420.0, + "step": 20676 + }, + { + "epoch": 2.270700636942675, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.9200408458709717, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7153932452201843, + "num_tokens": 535086914.0, + "step": 20677 + }, + { + "epoch": 2.270810454645289, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6333279609680176, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7285563945770264, + "num_tokens": 535107860.0, + "step": 20678 + }, + { + "epoch": 2.2709202723479027, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.442458391189575, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7235637903213501, + "num_tokens": 535134240.0, + "step": 20679 + }, + { + "epoch": 2.271030090050516, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6431145668029785, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.69950270652771, + "num_tokens": 535159061.0, + "step": 20680 + }, + { + "epoch": 2.27113990775313, + "ewc_loss": 
2.1338462829589844e-05, + "grad_norm": 2.485034227371216, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7237958312034607, + "num_tokens": 535183824.0, + "step": 20681 + }, + { + "epoch": 2.2712497254557436, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.765143871307373, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7118308544158936, + "num_tokens": 535209253.0, + "step": 20682 + }, + { + "epoch": 2.2713595431583573, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.707486391067505, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7077962756156921, + "num_tokens": 535232310.0, + "step": 20683 + }, + { + "epoch": 2.2714693608609706, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6482396125793457, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7247329354286194, + "num_tokens": 535254604.0, + "step": 20684 + }, + { + "epoch": 2.2715791785635844, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2500853538513184, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.7033014893531799, + "num_tokens": 535288578.0, + "step": 20685 + }, + { + "epoch": 2.271688996266198, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5701026916503906, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7061982154846191, + "num_tokens": 535313776.0, + "step": 20686 + }, + { + "epoch": 2.271798813968812, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.241671323776245, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.713270366191864, + "num_tokens": 535343650.0, + "step": 20687 + }, + { + "epoch": 2.271908631671425, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.813537359237671, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7299745082855225, + "num_tokens": 535363387.0, + "step": 20688 + }, + { + "epoch": 2.272018449374039, + "ewc_loss": 2.1338462829589844e-05, + 
"grad_norm": 2.7797257900238037, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7416285276412964, + "num_tokens": 535385358.0, + "step": 20689 + }, + { + "epoch": 2.2721282670766527, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4434492588043213, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6824349164962769, + "num_tokens": 535416079.0, + "step": 20690 + }, + { + "epoch": 2.2722380847792665, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4777956008911133, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7244276404380798, + "num_tokens": 535440675.0, + "step": 20691 + }, + { + "epoch": 2.2723479024818802, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4300472736358643, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7026064395904541, + "num_tokens": 535469567.0, + "step": 20692 + }, + { + "epoch": 2.2724577201844935, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.316216230392456, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6900043487548828, + "num_tokens": 535503782.0, + "step": 20693 + }, + { + "epoch": 2.2725675378871073, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.390773057937622, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7334535121917725, + "num_tokens": 535529052.0, + "step": 20694 + }, + { + "epoch": 2.272677355589721, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.811413288116455, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7336820960044861, + "num_tokens": 535550344.0, + "step": 20695 + }, + { + "epoch": 2.272787173292335, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5259430408477783, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7139883637428284, + "num_tokens": 535574011.0, + "step": 20696 + }, + { + "epoch": 2.2728969909949486, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 
2.3948893547058105, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7075573205947876, + "num_tokens": 535604376.0, + "step": 20697 + }, + { + "epoch": 2.273006808697562, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4223618507385254, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7094532251358032, + "num_tokens": 535633623.0, + "step": 20698 + }, + { + "epoch": 2.2731166264001756, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4431166648864746, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7085293531417847, + "num_tokens": 535658667.0, + "step": 20699 + }, + { + "epoch": 2.2732264441027894, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4959444999694824, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7465258836746216, + "num_tokens": 535682680.0, + "step": 20700 + }, + { + "epoch": 2.273336261805403, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7680532932281494, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7356598377227783, + "num_tokens": 535704776.0, + "step": 20701 + }, + { + "epoch": 2.273446079508017, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4861044883728027, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7113867998123169, + "num_tokens": 535731321.0, + "step": 20702 + }, + { + "epoch": 2.2735558972106302, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.898098945617676, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7368063926696777, + "num_tokens": 535749357.0, + "step": 20703 + }, + { + "epoch": 2.273665714913244, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2706358432769775, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7305953502655029, + "num_tokens": 535776414.0, + "step": 20704 + }, + { + "epoch": 2.2737755326158577, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.674760341644287, + 
"learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.693756103515625, + "num_tokens": 535800583.0, + "step": 20705 + }, + { + "epoch": 2.2738853503184715, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.503296375274658, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7099993824958801, + "num_tokens": 535827948.0, + "step": 20706 + }, + { + "epoch": 2.273995168021085, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4013237953186035, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.733879804611206, + "num_tokens": 535855953.0, + "step": 20707 + }, + { + "epoch": 2.2741049857236986, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6501076221466064, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7343753576278687, + "num_tokens": 535878268.0, + "step": 20708 + }, + { + "epoch": 2.2742148034263123, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.37821102142334, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7149062156677246, + "num_tokens": 535906332.0, + "step": 20709 + }, + { + "epoch": 2.274324621128926, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6399569511413574, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7151502370834351, + "num_tokens": 535930170.0, + "step": 20710 + }, + { + "epoch": 2.2744344388315394, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.636204481124878, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7154397964477539, + "num_tokens": 535956811.0, + "step": 20711 + }, + { + "epoch": 2.274544256534153, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3439176082611084, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7164888381958008, + "num_tokens": 535987062.0, + "step": 20712 + }, + { + "epoch": 2.274654074236767, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.276289463043213, + "learning_rate": 1e-06, + "loss": 
0.9293, + "mean_token_accuracy": 0.7290809154510498, + "num_tokens": 536016727.0, + "step": 20713 + }, + { + "epoch": 2.2747638919393807, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.588634729385376, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7115364670753479, + "num_tokens": 536040280.0, + "step": 20714 + }, + { + "epoch": 2.2748737096419944, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3938727378845215, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7021700143814087, + "num_tokens": 536067056.0, + "step": 20715 + }, + { + "epoch": 2.2749835273446077, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7095565795898438, + "learning_rate": 1e-06, + "loss": 1.0753, + "mean_token_accuracy": 0.693372368812561, + "num_tokens": 536090216.0, + "step": 20716 + }, + { + "epoch": 2.2750933450472215, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.537543773651123, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7258857488632202, + "num_tokens": 536113676.0, + "step": 20717 + }, + { + "epoch": 2.2752031627498353, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.354962110519409, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7273252010345459, + "num_tokens": 536140983.0, + "step": 20718 + }, + { + "epoch": 2.275312980452449, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.58402156829834, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.693548321723938, + "num_tokens": 536164566.0, + "step": 20719 + }, + { + "epoch": 2.2754227981550628, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4460320472717285, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7289160490036011, + "num_tokens": 536189222.0, + "step": 20720 + }, + { + "epoch": 2.275532615857676, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.872425079345703, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 
0.7281705141067505, + "num_tokens": 536207842.0, + "step": 20721 + }, + { + "epoch": 2.27564243356029, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.343414068222046, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7209773063659668, + "num_tokens": 536238776.0, + "step": 20722 + }, + { + "epoch": 2.2757522512629036, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.82063627243042, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7215428352355957, + "num_tokens": 536258384.0, + "step": 20723 + }, + { + "epoch": 2.2758620689655173, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2767302989959717, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7074126601219177, + "num_tokens": 536284965.0, + "step": 20724 + }, + { + "epoch": 2.275971886668131, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.331589460372925, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7199360132217407, + "num_tokens": 536313428.0, + "step": 20725 + }, + { + "epoch": 2.2760817043707444, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6819448471069336, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.7007660269737244, + "num_tokens": 536339517.0, + "step": 20726 + }, + { + "epoch": 2.276191522073358, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.329800844192505, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7179859280586243, + "num_tokens": 536365847.0, + "step": 20727 + }, + { + "epoch": 2.276301339775972, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3375601768493652, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.69419264793396, + "num_tokens": 536393986.0, + "step": 20728 + }, + { + "epoch": 2.2764111574785857, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.354937791824341, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6926364898681641, + "num_tokens": 
536422355.0, + "step": 20729 + }, + { + "epoch": 2.2765209751811994, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6217455863952637, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7109259963035583, + "num_tokens": 536447075.0, + "step": 20730 + }, + { + "epoch": 2.2766307928838128, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.434931755065918, + "learning_rate": 1e-06, + "loss": 1.0818, + "mean_token_accuracy": 0.6824161410331726, + "num_tokens": 536478409.0, + "step": 20731 + }, + { + "epoch": 2.2767406105864265, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.437150478363037, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7030340433120728, + "num_tokens": 536505080.0, + "step": 20732 + }, + { + "epoch": 2.2768504282890403, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.508336305618286, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7345291972160339, + "num_tokens": 536529626.0, + "step": 20733 + }, + { + "epoch": 2.276960245991654, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.382115602493286, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7318059206008911, + "num_tokens": 536556253.0, + "step": 20734 + }, + { + "epoch": 2.2770700636942673, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.6301865577697754, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7424010038375854, + "num_tokens": 536578185.0, + "step": 20735 + }, + { + "epoch": 2.277179881396881, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7986862659454346, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.7104619741439819, + "num_tokens": 536601047.0, + "step": 20736 + }, + { + "epoch": 2.277289699099495, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2862820625305176, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7276685237884521, + "num_tokens": 536631078.0, + "step": 20737 + 
}, + { + "epoch": 2.2773995168021086, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 3.0434694290161133, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7097118496894836, + "num_tokens": 536649282.0, + "step": 20738 + }, + { + "epoch": 2.277509334504722, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.3946166038513184, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7012102603912354, + "num_tokens": 536676114.0, + "step": 20739 + }, + { + "epoch": 2.2776191522073357, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5797574520111084, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.724284827709198, + "num_tokens": 536701464.0, + "step": 20740 + }, + { + "epoch": 2.2777289699099494, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1733298301696777, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7057538032531738, + "num_tokens": 536733933.0, + "step": 20741 + }, + { + "epoch": 2.277838787612563, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.1572837829589844, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7259901165962219, + "num_tokens": 536764702.0, + "step": 20742 + }, + { + "epoch": 2.277948605315177, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.5102686882019043, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6997379660606384, + "num_tokens": 536789400.0, + "step": 20743 + }, + { + "epoch": 2.2780584230177903, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.7957992553710938, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7413318753242493, + "num_tokens": 536810565.0, + "step": 20744 + }, + { + "epoch": 2.278168240720404, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.4556519985198975, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7143163681030273, + "num_tokens": 536835174.0, + "step": 20745 + }, + { + "epoch": 
2.278278058423018, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.467174530029297, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7086284160614014, + "num_tokens": 536860085.0, + "step": 20746 + }, + { + "epoch": 2.2783878761256315, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.623558282852173, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7156535983085632, + "num_tokens": 536883060.0, + "step": 20747 + }, + { + "epoch": 2.2784976938282453, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2584877014160156, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7097611427307129, + "num_tokens": 536912050.0, + "step": 20748 + }, + { + "epoch": 2.2786075115308586, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5885229110717773, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7248783707618713, + "num_tokens": 536934709.0, + "step": 20749 + }, + { + "epoch": 2.2787173292334724, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.271291732788086, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7213597893714905, + "num_tokens": 536963123.0, + "step": 20750 + }, + { + "epoch": 2.278827146936086, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.256775140762329, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.7012742757797241, + "num_tokens": 536994318.0, + "step": 20751 + }, + { + "epoch": 2.2789369646387, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.885599136352539, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7228434085845947, + "num_tokens": 537013036.0, + "step": 20752 + }, + { + "epoch": 2.2790467823413136, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4362094402313232, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6920679211616516, + "num_tokens": 537042769.0, + "step": 20753 + }, + { + "epoch": 2.279156600043927, + "ewc_loss": 
2.1457672119140625e-05, + "grad_norm": 2.4154534339904785, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7213608622550964, + "num_tokens": 537070224.0, + "step": 20754 + }, + { + "epoch": 2.2792664177465407, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3483192920684814, + "learning_rate": 1e-06, + "loss": 1.095, + "mean_token_accuracy": 0.6866159439086914, + "num_tokens": 537100398.0, + "step": 20755 + }, + { + "epoch": 2.2793762354491545, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.406989097595215, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7219660878181458, + "num_tokens": 537124562.0, + "step": 20756 + }, + { + "epoch": 2.279486053151768, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.593062162399292, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7057267427444458, + "num_tokens": 537148606.0, + "step": 20757 + }, + { + "epoch": 2.2795958708543815, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3108327388763428, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7074615955352783, + "num_tokens": 537177375.0, + "step": 20758 + }, + { + "epoch": 2.2797056885569953, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.619957447052002, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7355267405509949, + "num_tokens": 537198936.0, + "step": 20759 + }, + { + "epoch": 2.279815506259609, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.642503261566162, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7348687648773193, + "num_tokens": 537222002.0, + "step": 20760 + }, + { + "epoch": 2.279925323962223, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.276604413986206, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.709247350692749, + "num_tokens": 537252011.0, + "step": 20761 + }, + { + "epoch": 2.2800351416648366, + "ewc_loss": 2.1457672119140625e-05, + 
"grad_norm": 2.5214953422546387, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.728469729423523, + "num_tokens": 537275960.0, + "step": 20762 + }, + { + "epoch": 2.28014495936745, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2612576484680176, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7189005613327026, + "num_tokens": 537306553.0, + "step": 20763 + }, + { + "epoch": 2.2802547770700636, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6523189544677734, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7254859805107117, + "num_tokens": 537329446.0, + "step": 20764 + }, + { + "epoch": 2.2803645947726774, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6644906997680664, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7219029664993286, + "num_tokens": 537352940.0, + "step": 20765 + }, + { + "epoch": 2.280474412475291, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4046432971954346, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7242122888565063, + "num_tokens": 537381267.0, + "step": 20766 + }, + { + "epoch": 2.2805842301779045, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5454180240631104, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7153639793395996, + "num_tokens": 537407946.0, + "step": 20767 + }, + { + "epoch": 2.280694047880518, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.767498254776001, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7336640357971191, + "num_tokens": 537426873.0, + "step": 20768 + }, + { + "epoch": 2.280803865583132, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4502153396606445, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7187533378601074, + "num_tokens": 537453441.0, + "step": 20769 + }, + { + "epoch": 2.2809136832857457, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.654266834259033, 
+ "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7125062942504883, + "num_tokens": 537477762.0, + "step": 20770 + }, + { + "epoch": 2.2810235009883595, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4675180912017822, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7093180418014526, + "num_tokens": 537504972.0, + "step": 20771 + }, + { + "epoch": 2.281133318690973, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1917529106140137, + "learning_rate": 1e-06, + "loss": 1.0862, + "mean_token_accuracy": 0.6882643103599548, + "num_tokens": 537538606.0, + "step": 20772 + }, + { + "epoch": 2.2812431363935866, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1980345249176025, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.7048819065093994, + "num_tokens": 537570654.0, + "step": 20773 + }, + { + "epoch": 2.2813529540962003, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.386219024658203, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7107768058776855, + "num_tokens": 537597100.0, + "step": 20774 + }, + { + "epoch": 2.281462771798814, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.545306444168091, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7236769199371338, + "num_tokens": 537621795.0, + "step": 20775 + }, + { + "epoch": 2.281572589501428, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5353052616119385, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.719714343547821, + "num_tokens": 537645966.0, + "step": 20776 + }, + { + "epoch": 2.281682407204041, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2683725357055664, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6999390125274658, + "num_tokens": 537675897.0, + "step": 20777 + }, + { + "epoch": 2.281792224906655, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3496322631835938, + "learning_rate": 1e-06, + 
"loss": 0.9533, + "mean_token_accuracy": 0.7193925976753235, + "num_tokens": 537703630.0, + "step": 20778 + }, + { + "epoch": 2.2819020426092687, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3792805671691895, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6854113936424255, + "num_tokens": 537731596.0, + "step": 20779 + }, + { + "epoch": 2.2820118603118824, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.401702404022217, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7119377851486206, + "num_tokens": 537759309.0, + "step": 20780 + }, + { + "epoch": 2.282121678014496, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6832809448242188, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7209911346435547, + "num_tokens": 537779453.0, + "step": 20781 + }, + { + "epoch": 2.2822314957171095, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5551185607910156, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7281793355941772, + "num_tokens": 537804432.0, + "step": 20782 + }, + { + "epoch": 2.2823413134197232, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2085962295532227, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7198028564453125, + "num_tokens": 537833522.0, + "step": 20783 + }, + { + "epoch": 2.282451131122337, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.272451639175415, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7115755081176758, + "num_tokens": 537861995.0, + "step": 20784 + }, + { + "epoch": 2.2825609488249508, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7307627201080322, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7337198257446289, + "num_tokens": 537883858.0, + "step": 20785 + }, + { + "epoch": 2.282670766527564, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.621399164199829, + "learning_rate": 1e-06, + "loss": 1.0206, + 
"mean_token_accuracy": 0.6944007277488708, + "num_tokens": 537910691.0, + "step": 20786 + }, + { + "epoch": 2.282780584230178, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.8134217262268066, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7144197225570679, + "num_tokens": 537932131.0, + "step": 20787 + }, + { + "epoch": 2.2828904019327916, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.524177312850952, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6978751420974731, + "num_tokens": 537959397.0, + "step": 20788 + }, + { + "epoch": 2.2830002196354053, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7987987995147705, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7069690227508545, + "num_tokens": 537980465.0, + "step": 20789 + }, + { + "epoch": 2.2831100373380186, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1836633682250977, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7025663256645203, + "num_tokens": 538012996.0, + "step": 20790 + }, + { + "epoch": 2.2832198550406324, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3508336544036865, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6958029270172119, + "num_tokens": 538042730.0, + "step": 20791 + }, + { + "epoch": 2.283329672743246, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7783761024475098, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7190501689910889, + "num_tokens": 538064188.0, + "step": 20792 + }, + { + "epoch": 2.28343949044586, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.527460813522339, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7333180904388428, + "num_tokens": 538089270.0, + "step": 20793 + }, + { + "epoch": 2.2835493081484737, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3648345470428467, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 
0.7098820209503174, + "num_tokens": 538117966.0, + "step": 20794 + }, + { + "epoch": 2.283659125851087, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.303420305252075, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7294110059738159, + "num_tokens": 538146195.0, + "step": 20795 + }, + { + "epoch": 2.2837689435537007, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4393553733825684, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.7174541354179382, + "num_tokens": 538171123.0, + "step": 20796 + }, + { + "epoch": 2.2838787612563145, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4964590072631836, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6961957216262817, + "num_tokens": 538195696.0, + "step": 20797 + }, + { + "epoch": 2.2839885789589283, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.253995418548584, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7114960551261902, + "num_tokens": 538224843.0, + "step": 20798 + }, + { + "epoch": 2.284098396661542, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.9604651927948, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7257675528526306, + "num_tokens": 538243669.0, + "step": 20799 + }, + { + "epoch": 2.2842082143641553, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.477541446685791, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.720977783203125, + "num_tokens": 538271782.0, + "step": 20800 + }, + { + "epoch": 2.284318032066769, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.511308193206787, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.711814820766449, + "num_tokens": 538295796.0, + "step": 20801 + }, + { + "epoch": 2.284427849769383, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.575070381164551, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7024992108345032, + "num_tokens": 
538319690.0, + "step": 20802 + }, + { + "epoch": 2.2845376674719966, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4451563358306885, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7085731029510498, + "num_tokens": 538345376.0, + "step": 20803 + }, + { + "epoch": 2.2846474851746104, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5506386756896973, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7249548435211182, + "num_tokens": 538368600.0, + "step": 20804 + }, + { + "epoch": 2.2847573028772237, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3940658569335938, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7159658074378967, + "num_tokens": 538396941.0, + "step": 20805 + }, + { + "epoch": 2.2848671205798374, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.608240842819214, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6902114152908325, + "num_tokens": 538423386.0, + "step": 20806 + }, + { + "epoch": 2.284976938282451, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.51212477684021, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7080771923065186, + "num_tokens": 538449073.0, + "step": 20807 + }, + { + "epoch": 2.285086755985065, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1824283599853516, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.714523196220398, + "num_tokens": 538481914.0, + "step": 20808 + }, + { + "epoch": 2.2851965736876787, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.687899112701416, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7346091866493225, + "num_tokens": 538502236.0, + "step": 20809 + }, + { + "epoch": 2.285306391390292, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.780987501144409, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7149120569229126, + "num_tokens": 538522773.0, + "step": 20810 + 
}, + { + "epoch": 2.2854162090929058, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4836623668670654, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7213634252548218, + "num_tokens": 538547524.0, + "step": 20811 + }, + { + "epoch": 2.2855260267955195, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6628947257995605, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7292303442955017, + "num_tokens": 538569996.0, + "step": 20812 + }, + { + "epoch": 2.2856358444981333, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.290015697479248, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7099672555923462, + "num_tokens": 538597151.0, + "step": 20813 + }, + { + "epoch": 2.2857456622007466, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2613742351531982, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6943274140357971, + "num_tokens": 538627382.0, + "step": 20814 + }, + { + "epoch": 2.2858554799033604, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.503755569458008, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7134312391281128, + "num_tokens": 538652426.0, + "step": 20815 + }, + { + "epoch": 2.285965297605974, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.364353656768799, + "learning_rate": 1e-06, + "loss": 1.099, + "mean_token_accuracy": 0.675430178642273, + "num_tokens": 538681280.0, + "step": 20816 + }, + { + "epoch": 2.286075115308588, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3334693908691406, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.704943835735321, + "num_tokens": 538709258.0, + "step": 20817 + }, + { + "epoch": 2.286184933011201, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.613786458969116, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7179421186447144, + "num_tokens": 538734645.0, + "step": 20818 + }, + { + "epoch": 
2.286294750713815, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.543348550796509, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.725013256072998, + "num_tokens": 538759285.0, + "step": 20819 + }, + { + "epoch": 2.2864045684164287, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.243767499923706, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6896877288818359, + "num_tokens": 538789988.0, + "step": 20820 + }, + { + "epoch": 2.2865143861190425, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.527783155441284, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7235866189002991, + "num_tokens": 538813693.0, + "step": 20821 + }, + { + "epoch": 2.286624203821656, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3983914852142334, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7280585765838623, + "num_tokens": 538839601.0, + "step": 20822 + }, + { + "epoch": 2.2867340215242695, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.199113130569458, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.712287187576294, + "num_tokens": 538869111.0, + "step": 20823 + }, + { + "epoch": 2.2868438392268833, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4041929244995117, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7127533555030823, + "num_tokens": 538895327.0, + "step": 20824 + }, + { + "epoch": 2.286953656929497, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.634643077850342, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6963934898376465, + "num_tokens": 538917551.0, + "step": 20825 + }, + { + "epoch": 2.287063474632111, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.279252529144287, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7034574151039124, + "num_tokens": 538948810.0, + "step": 20826 + }, + { + "epoch": 2.2871732923347246, + "ewc_loss": 
2.1457672119140625e-05, + "grad_norm": 2.3891773223876953, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6932089328765869, + "num_tokens": 538977071.0, + "step": 20827 + }, + { + "epoch": 2.287283110037338, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.355565309524536, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7271079421043396, + "num_tokens": 539005221.0, + "step": 20828 + }, + { + "epoch": 2.2873929277399516, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.180346727371216, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7054426074028015, + "num_tokens": 539037391.0, + "step": 20829 + }, + { + "epoch": 2.2875027454425654, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 3.0620110034942627, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7226017117500305, + "num_tokens": 539054853.0, + "step": 20830 + }, + { + "epoch": 2.287612563145179, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.615668296813965, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7348861694335938, + "num_tokens": 539076794.0, + "step": 20831 + }, + { + "epoch": 2.287722380847793, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5390045642852783, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7498681545257568, + "num_tokens": 539099218.0, + "step": 20832 + }, + { + "epoch": 2.287832198550406, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3915534019470215, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7127553820610046, + "num_tokens": 539124379.0, + "step": 20833 + }, + { + "epoch": 2.28794201625302, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.459134101867676, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7091989517211914, + "num_tokens": 539150985.0, + "step": 20834 + }, + { + "epoch": 2.2880518339556337, + "ewc_loss": 2.1457672119140625e-05, + 
"grad_norm": 2.7392075061798096, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7270890474319458, + "num_tokens": 539171635.0, + "step": 20835 + }, + { + "epoch": 2.2881616516582475, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6149697303771973, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7252087593078613, + "num_tokens": 539195741.0, + "step": 20836 + }, + { + "epoch": 2.288271469360861, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.401744842529297, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.719986081123352, + "num_tokens": 539224786.0, + "step": 20837 + }, + { + "epoch": 2.2883812870634745, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.744619607925415, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6980505585670471, + "num_tokens": 539249631.0, + "step": 20838 + }, + { + "epoch": 2.2884911047660883, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.367370128631592, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7109432220458984, + "num_tokens": 539277615.0, + "step": 20839 + }, + { + "epoch": 2.288600922468702, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.603034019470215, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7163620591163635, + "num_tokens": 539300415.0, + "step": 20840 + }, + { + "epoch": 2.2887107401713154, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.498544692993164, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7258732914924622, + "num_tokens": 539324690.0, + "step": 20841 + }, + { + "epoch": 2.288820557873929, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2890918254852295, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7266438007354736, + "num_tokens": 539353522.0, + "step": 20842 + }, + { + "epoch": 2.288930375576543, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3432774543762207, + 
"learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7175525426864624, + "num_tokens": 539380717.0, + "step": 20843 + }, + { + "epoch": 2.2890401932791566, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.553548574447632, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6919847130775452, + "num_tokens": 539406095.0, + "step": 20844 + }, + { + "epoch": 2.2891500109817704, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7756295204162598, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7242057919502258, + "num_tokens": 539428055.0, + "step": 20845 + }, + { + "epoch": 2.2892598286843837, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4113433361053467, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7296086549758911, + "num_tokens": 539453392.0, + "step": 20846 + }, + { + "epoch": 2.2893696463869975, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4088454246520996, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7205772399902344, + "num_tokens": 539478657.0, + "step": 20847 + }, + { + "epoch": 2.2894794640896112, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.449310064315796, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7189929485321045, + "num_tokens": 539503239.0, + "step": 20848 + }, + { + "epoch": 2.289589281792225, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.375945806503296, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7374475002288818, + "num_tokens": 539529013.0, + "step": 20849 + }, + { + "epoch": 2.2896990994948387, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.432438373565674, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7338049411773682, + "num_tokens": 539552767.0, + "step": 20850 + }, + { + "epoch": 2.289808917197452, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.409111976623535, + "learning_rate": 1e-06, + 
"loss": 0.9032, + "mean_token_accuracy": 0.7397591471672058, + "num_tokens": 539579488.0, + "step": 20851 + }, + { + "epoch": 2.289918734900066, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3520772457122803, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7070073485374451, + "num_tokens": 539606583.0, + "step": 20852 + }, + { + "epoch": 2.2900285526026796, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4678843021392822, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.700788140296936, + "num_tokens": 539631989.0, + "step": 20853 + }, + { + "epoch": 2.2901383703052933, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.514253616333008, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7110991477966309, + "num_tokens": 539656172.0, + "step": 20854 + }, + { + "epoch": 2.290248188007907, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.326627492904663, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7078961133956909, + "num_tokens": 539684394.0, + "step": 20855 + }, + { + "epoch": 2.2903580057105204, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.398770570755005, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6904878616333008, + "num_tokens": 539712695.0, + "step": 20856 + }, + { + "epoch": 2.290467823413134, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.341336727142334, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6982976198196411, + "num_tokens": 539741458.0, + "step": 20857 + }, + { + "epoch": 2.290577641115748, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2885854244232178, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7123183608055115, + "num_tokens": 539770286.0, + "step": 20858 + }, + { + "epoch": 2.2906874588183617, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.551748752593994, + "learning_rate": 1e-06, + "loss": 0.9991, + 
"mean_token_accuracy": 0.7012743353843689, + "num_tokens": 539797632.0, + "step": 20859 + }, + { + "epoch": 2.2907972765209754, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.0923612117767334, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7032876014709473, + "num_tokens": 539831806.0, + "step": 20860 + }, + { + "epoch": 2.2909070942235887, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.226768732070923, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7114231586456299, + "num_tokens": 539859437.0, + "step": 20861 + }, + { + "epoch": 2.2910169119262025, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6966376304626465, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7054008841514587, + "num_tokens": 539880540.0, + "step": 20862 + }, + { + "epoch": 2.2911267296288162, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.884580373764038, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7362639904022217, + "num_tokens": 539899973.0, + "step": 20863 + }, + { + "epoch": 2.29123654733143, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1316025257110596, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7084552049636841, + "num_tokens": 539934351.0, + "step": 20864 + }, + { + "epoch": 2.2913463650340433, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.649268388748169, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7291136980056763, + "num_tokens": 539956011.0, + "step": 20865 + }, + { + "epoch": 2.291456182736657, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6345129013061523, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7271332740783691, + "num_tokens": 539976916.0, + "step": 20866 + }, + { + "epoch": 2.291566000439271, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3675806522369385, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 
0.7018129825592041, + "num_tokens": 540002841.0, + "step": 20867 + }, + { + "epoch": 2.2916758181418846, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4805853366851807, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7140421271324158, + "num_tokens": 540028328.0, + "step": 20868 + }, + { + "epoch": 2.291785635844498, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3196613788604736, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7111124992370605, + "num_tokens": 540054231.0, + "step": 20869 + }, + { + "epoch": 2.2918954535471117, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7797188758850098, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7209385633468628, + "num_tokens": 540072985.0, + "step": 20870 + }, + { + "epoch": 2.2920052712497254, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6852047443389893, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7244817018508911, + "num_tokens": 540093628.0, + "step": 20871 + }, + { + "epoch": 2.292115088952339, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.69138765335083, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7168859243392944, + "num_tokens": 540115163.0, + "step": 20872 + }, + { + "epoch": 2.292224906654953, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.619952440261841, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7329764366149902, + "num_tokens": 540137017.0, + "step": 20873 + }, + { + "epoch": 2.2923347243575662, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7249016761779785, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7131432890892029, + "num_tokens": 540158223.0, + "step": 20874 + }, + { + "epoch": 2.29244454206018, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4786570072174072, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6996254920959473, + 
"num_tokens": 540184447.0, + "step": 20875 + }, + { + "epoch": 2.2925543597627938, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3046376705169678, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7060949802398682, + "num_tokens": 540214098.0, + "step": 20876 + }, + { + "epoch": 2.2926641774654075, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.568298578262329, + "learning_rate": 1e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7509067058563232, + "num_tokens": 540236665.0, + "step": 20877 + }, + { + "epoch": 2.2927739951680213, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5648481845855713, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7310366630554199, + "num_tokens": 540262127.0, + "step": 20878 + }, + { + "epoch": 2.2928838128706346, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.701256513595581, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7206820845603943, + "num_tokens": 540286459.0, + "step": 20879 + }, + { + "epoch": 2.2929936305732483, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6448566913604736, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.708226203918457, + "num_tokens": 540308637.0, + "step": 20880 + }, + { + "epoch": 2.293103448275862, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6954562664031982, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7360184192657471, + "num_tokens": 540330801.0, + "step": 20881 + }, + { + "epoch": 2.293213265978476, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7521753311157227, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7424860596656799, + "num_tokens": 540350290.0, + "step": 20882 + }, + { + "epoch": 2.2933230836810896, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4851510524749756, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.72346031665802, + "num_tokens": 540375822.0, + 
"step": 20883 + }, + { + "epoch": 2.293432901383703, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.663094997406006, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.712099015712738, + "num_tokens": 540400247.0, + "step": 20884 + }, + { + "epoch": 2.2935427190863167, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.8167901039123535, + "learning_rate": 1e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.7494627237319946, + "num_tokens": 540419036.0, + "step": 20885 + }, + { + "epoch": 2.2936525367889304, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.0976505279541016, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6831219792366028, + "num_tokens": 540454269.0, + "step": 20886 + }, + { + "epoch": 2.293762354491544, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.446662425994873, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7255663275718689, + "num_tokens": 540478961.0, + "step": 20887 + }, + { + "epoch": 2.2938721721941575, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.372863531112671, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.728690505027771, + "num_tokens": 540505132.0, + "step": 20888 + }, + { + "epoch": 2.2939819898967713, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 3.050074338912964, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7195733189582825, + "num_tokens": 540524365.0, + "step": 20889 + }, + { + "epoch": 2.294091807599385, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.497745990753174, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7088671922683716, + "num_tokens": 540551399.0, + "step": 20890 + }, + { + "epoch": 2.294201625301999, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.460172414779663, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.728752851486206, + "num_tokens": 540575806.0, + "step": 20891 + }, + { + "epoch": 
2.294311443004612, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5707294940948486, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7162226438522339, + "num_tokens": 540599400.0, + "step": 20892 + }, + { + "epoch": 2.294421260707226, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.306338310241699, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7187898755073547, + "num_tokens": 540629135.0, + "step": 20893 + }, + { + "epoch": 2.2945310784098396, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2998411655426025, + "learning_rate": 1e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6851470470428467, + "num_tokens": 540659440.0, + "step": 20894 + }, + { + "epoch": 2.2946408961124534, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7055673599243164, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7320452928543091, + "num_tokens": 540680501.0, + "step": 20895 + }, + { + "epoch": 2.294750713815067, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5345418453216553, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7196261882781982, + "num_tokens": 540703039.0, + "step": 20896 + }, + { + "epoch": 2.2948605315176804, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.360811710357666, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7015864849090576, + "num_tokens": 540732665.0, + "step": 20897 + }, + { + "epoch": 2.294970349220294, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.423800468444824, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7095998525619507, + "num_tokens": 540759246.0, + "step": 20898 + }, + { + "epoch": 2.295080166922908, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3126323223114014, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.7005894184112549, + "num_tokens": 540788127.0, + "step": 20899 + }, + { + "epoch": 2.2951899846255217, + 
"ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.9611737728118896, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7307727336883545, + "num_tokens": 540806931.0, + "step": 20900 + }, + { + "epoch": 2.2952998023281355, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.9041757583618164, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7169883847236633, + "num_tokens": 540826294.0, + "step": 20901 + }, + { + "epoch": 2.2954096200307488, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4424848556518555, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7096030712127686, + "num_tokens": 540851938.0, + "step": 20902 + }, + { + "epoch": 2.2955194377333625, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.400933027267456, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6963536739349365, + "num_tokens": 540879803.0, + "step": 20903 + }, + { + "epoch": 2.2956292554359763, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.304959774017334, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6950706243515015, + "num_tokens": 540908534.0, + "step": 20904 + }, + { + "epoch": 2.29573907313859, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5699238777160645, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7084476351737976, + "num_tokens": 540930373.0, + "step": 20905 + }, + { + "epoch": 2.295848890841204, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4369256496429443, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7249020934104919, + "num_tokens": 540954331.0, + "step": 20906 + }, + { + "epoch": 2.295958708543817, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6743977069854736, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7315352559089661, + "num_tokens": 540975078.0, + "step": 20907 + }, + { + "epoch": 2.296068526246431, + "ewc_loss": 
2.1457672119140625e-05, + "grad_norm": 2.4719042778015137, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7021727561950684, + "num_tokens": 541000601.0, + "step": 20908 + }, + { + "epoch": 2.2961783439490446, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4135470390319824, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7057300806045532, + "num_tokens": 541028698.0, + "step": 20909 + }, + { + "epoch": 2.2962881616516584, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.188518762588501, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6911737322807312, + "num_tokens": 541060964.0, + "step": 20910 + }, + { + "epoch": 2.296397979354272, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.518018960952759, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7088788747787476, + "num_tokens": 541083273.0, + "step": 20911 + }, + { + "epoch": 2.2965077970568855, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.562875747680664, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6921299695968628, + "num_tokens": 541107607.0, + "step": 20912 + }, + { + "epoch": 2.296617614759499, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.492022752761841, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7153494954109192, + "num_tokens": 541131056.0, + "step": 20913 + }, + { + "epoch": 2.296727432462113, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2937307357788086, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7053157091140747, + "num_tokens": 541158773.0, + "step": 20914 + }, + { + "epoch": 2.2968372501647267, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 7.102932453155518, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7218291759490967, + "num_tokens": 541181050.0, + "step": 20915 + }, + { + "epoch": 2.29694706786734, + "ewc_loss": 2.1457672119140625e-05, + 
"grad_norm": 2.4603381156921387, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.709648847579956, + "num_tokens": 541210007.0, + "step": 20916 + }, + { + "epoch": 2.297056885569954, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.413952589035034, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.7048341631889343, + "num_tokens": 541235748.0, + "step": 20917 + }, + { + "epoch": 2.2971667032725676, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.270543098449707, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7102930545806885, + "num_tokens": 541267118.0, + "step": 20918 + }, + { + "epoch": 2.2972765209751813, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.508769989013672, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7193013429641724, + "num_tokens": 541290251.0, + "step": 20919 + }, + { + "epoch": 2.2973863386777946, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2229411602020264, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.6965081095695496, + "num_tokens": 541321159.0, + "step": 20920 + }, + { + "epoch": 2.2974961563804084, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2827417850494385, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7038649916648865, + "num_tokens": 541348450.0, + "step": 20921 + }, + { + "epoch": 2.297605974083022, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.598553419113159, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7296354174613953, + "num_tokens": 541372135.0, + "step": 20922 + }, + { + "epoch": 2.297715791785636, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2849347591400146, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7062978148460388, + "num_tokens": 541401150.0, + "step": 20923 + }, + { + "epoch": 2.2978256094882497, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1107821464538574, 
+ "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.7015456557273865, + "num_tokens": 541435158.0, + "step": 20924 + }, + { + "epoch": 2.297935427190863, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 3.0286648273468018, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7166221141815186, + "num_tokens": 541453374.0, + "step": 20925 + }, + { + "epoch": 2.2980452448934767, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4883339405059814, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7657731175422668, + "num_tokens": 541475660.0, + "step": 20926 + }, + { + "epoch": 2.2981550625960905, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4135985374450684, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7386763095855713, + "num_tokens": 541503522.0, + "step": 20927 + }, + { + "epoch": 2.2982648802987042, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.559415102005005, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.719902515411377, + "num_tokens": 541528360.0, + "step": 20928 + }, + { + "epoch": 2.298374698001318, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5308749675750732, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7017225027084351, + "num_tokens": 541554598.0, + "step": 20929 + }, + { + "epoch": 2.2984845157039313, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5357532501220703, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7403618097305298, + "num_tokens": 541577840.0, + "step": 20930 + }, + { + "epoch": 2.298594333406545, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3275723457336426, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6953635215759277, + "num_tokens": 541608864.0, + "step": 20931 + }, + { + "epoch": 2.298704151109159, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3527588844299316, + "learning_rate": 1e-06, + 
"loss": 1.0435, + "mean_token_accuracy": 0.6882729530334473, + "num_tokens": 541636803.0, + "step": 20932 + }, + { + "epoch": 2.2988139688117726, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.329831123352051, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7121795415878296, + "num_tokens": 541664820.0, + "step": 20933 + }, + { + "epoch": 2.2989237865143863, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.475615978240967, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.708846926689148, + "num_tokens": 541690577.0, + "step": 20934 + }, + { + "epoch": 2.2990336042169996, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.370483875274658, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7141085267066956, + "num_tokens": 541716375.0, + "step": 20935 + }, + { + "epoch": 2.2991434219196134, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.43865966796875, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7282874584197998, + "num_tokens": 541742331.0, + "step": 20936 + }, + { + "epoch": 2.299253239622227, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3581342697143555, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7228339910507202, + "num_tokens": 541767319.0, + "step": 20937 + }, + { + "epoch": 2.299363057324841, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 7.030289173126221, + "learning_rate": 1e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6926828622817993, + "num_tokens": 541795125.0, + "step": 20938 + }, + { + "epoch": 2.2994728750274542, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.151818037033081, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6983777284622192, + "num_tokens": 541830697.0, + "step": 20939 + }, + { + "epoch": 2.299582692730068, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5419812202453613, + "learning_rate": 1e-06, + "loss": 0.9214, + 
"mean_token_accuracy": 0.7220332622528076, + "num_tokens": 541856540.0, + "step": 20940 + }, + { + "epoch": 2.2996925104326817, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.9289822578430176, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7365531325340271, + "num_tokens": 541876615.0, + "step": 20941 + }, + { + "epoch": 2.2998023281352955, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.238468647003174, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7094128131866455, + "num_tokens": 541903735.0, + "step": 20942 + }, + { + "epoch": 2.2999121458379093, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2264907360076904, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6980215311050415, + "num_tokens": 541935807.0, + "step": 20943 + }, + { + "epoch": 2.3000219635405226, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.402414083480835, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7274945974349976, + "num_tokens": 541962688.0, + "step": 20944 + }, + { + "epoch": 2.3001317812431363, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3762080669403076, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7073628902435303, + "num_tokens": 541988938.0, + "step": 20945 + }, + { + "epoch": 2.30024159894575, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7447245121002197, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7443757653236389, + "num_tokens": 542010466.0, + "step": 20946 + }, + { + "epoch": 2.300351416648364, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.514162302017212, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7202698588371277, + "num_tokens": 542034293.0, + "step": 20947 + }, + { + "epoch": 2.300461234350977, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5216915607452393, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 
0.6986497044563293, + "num_tokens": 542059149.0, + "step": 20948 + }, + { + "epoch": 2.300571052053591, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1609079837799072, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7117017507553101, + "num_tokens": 542089035.0, + "step": 20949 + }, + { + "epoch": 2.3006808697562047, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.492159605026245, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6818444728851318, + "num_tokens": 542116054.0, + "step": 20950 + }, + { + "epoch": 2.3007906874588184, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3184969425201416, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7036245465278625, + "num_tokens": 542145231.0, + "step": 20951 + }, + { + "epoch": 2.300900505161432, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2821497917175293, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.710163950920105, + "num_tokens": 542173544.0, + "step": 20952 + }, + { + "epoch": 2.3010103228640455, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3171963691711426, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6963177919387817, + "num_tokens": 542202205.0, + "step": 20953 + }, + { + "epoch": 2.3011201405666593, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4255917072296143, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7090572118759155, + "num_tokens": 542227842.0, + "step": 20954 + }, + { + "epoch": 2.301229958269273, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.221367120742798, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7106097340583801, + "num_tokens": 542255069.0, + "step": 20955 + }, + { + "epoch": 2.3013397759718868, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4142403602600098, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7149469256401062, + 
"num_tokens": 542282401.0, + "step": 20956 + }, + { + "epoch": 2.3014495936745005, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2951529026031494, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7405118942260742, + "num_tokens": 542308418.0, + "step": 20957 + }, + { + "epoch": 2.301559411377114, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5354790687561035, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7114042639732361, + "num_tokens": 542333647.0, + "step": 20958 + }, + { + "epoch": 2.3016692290797276, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4774954319000244, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7168022394180298, + "num_tokens": 542359218.0, + "step": 20959 + }, + { + "epoch": 2.3017790467823414, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3692820072174072, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.723677396774292, + "num_tokens": 542388516.0, + "step": 20960 + }, + { + "epoch": 2.301888864484955, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.363983631134033, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7023538947105408, + "num_tokens": 542417076.0, + "step": 20961 + }, + { + "epoch": 2.301998682187569, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.643437147140503, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7156685590744019, + "num_tokens": 542439792.0, + "step": 20962 + }, + { + "epoch": 2.302108499890182, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5197174549102783, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7079805731773376, + "num_tokens": 542469589.0, + "step": 20963 + }, + { + "epoch": 2.302218317592796, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4578258991241455, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.7066586017608643, + "num_tokens": 542494322.0, + 
"step": 20964 + }, + { + "epoch": 2.3023281352954097, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5079843997955322, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.735187292098999, + "num_tokens": 542519882.0, + "step": 20965 + }, + { + "epoch": 2.3024379529980235, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3475348949432373, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7086527347564697, + "num_tokens": 542548456.0, + "step": 20966 + }, + { + "epoch": 2.3025477707006368, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.622969388961792, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7137819528579712, + "num_tokens": 542573067.0, + "step": 20967 + }, + { + "epoch": 2.3026575884032505, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.308300733566284, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7058173418045044, + "num_tokens": 542602039.0, + "step": 20968 + }, + { + "epoch": 2.3027674061058643, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.728095054626465, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7323732972145081, + "num_tokens": 542622463.0, + "step": 20969 + }, + { + "epoch": 2.302877223808478, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4589617252349854, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6905695199966431, + "num_tokens": 542649997.0, + "step": 20970 + }, + { + "epoch": 2.3029870415110913, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.33634614944458, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7246324419975281, + "num_tokens": 542679946.0, + "step": 20971 + }, + { + "epoch": 2.303096859213705, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.512659788131714, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7201747894287109, + "num_tokens": 542704940.0, + "step": 20972 + }, + { + 
"epoch": 2.303206676916319, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.380012035369873, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7080238461494446, + "num_tokens": 542735386.0, + "step": 20973 + }, + { + "epoch": 2.3033164946189326, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4071011543273926, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7245455980300903, + "num_tokens": 542760679.0, + "step": 20974 + }, + { + "epoch": 2.3034263123215464, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.269623041152954, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.728094220161438, + "num_tokens": 542789594.0, + "step": 20975 + }, + { + "epoch": 2.3035361300241597, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.312822103500366, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7052916288375854, + "num_tokens": 542817666.0, + "step": 20976 + }, + { + "epoch": 2.3036459477267734, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5655643939971924, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7324123382568359, + "num_tokens": 542841983.0, + "step": 20977 + }, + { + "epoch": 2.303755765429387, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.587172031402588, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7191212177276611, + "num_tokens": 542869857.0, + "step": 20978 + }, + { + "epoch": 2.303865583132001, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2577459812164307, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7107539772987366, + "num_tokens": 542900501.0, + "step": 20979 + }, + { + "epoch": 2.3039754008346147, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.703725576400757, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7194151878356934, + "num_tokens": 542922771.0, + "step": 20980 + }, + { + "epoch": 2.304085218537228, + 
"ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.50239634513855, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7365996837615967, + "num_tokens": 542945980.0, + "step": 20981 + }, + { + "epoch": 2.304195036239842, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6212656497955322, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.702342689037323, + "num_tokens": 542969733.0, + "step": 20982 + }, + { + "epoch": 2.3043048539424555, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.461758613586426, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7297539710998535, + "num_tokens": 542997008.0, + "step": 20983 + }, + { + "epoch": 2.3044146716450693, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.578209638595581, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7197999358177185, + "num_tokens": 543021143.0, + "step": 20984 + }, + { + "epoch": 2.304524489347683, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5211901664733887, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7284735441207886, + "num_tokens": 543045313.0, + "step": 20985 + }, + { + "epoch": 2.3046343070502964, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4830856323242188, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.725793719291687, + "num_tokens": 543068448.0, + "step": 20986 + }, + { + "epoch": 2.30474412475291, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2214834690093994, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7069885730743408, + "num_tokens": 543099282.0, + "step": 20987 + }, + { + "epoch": 2.304853942455524, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7544057369232178, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7171591520309448, + "num_tokens": 543120078.0, + "step": 20988 + }, + { + "epoch": 2.3049637601581376, + "ewc_loss": 2.1457672119140625e-05, 
+ "grad_norm": 2.5078444480895996, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7182273268699646, + "num_tokens": 543143251.0, + "step": 20989 + }, + { + "epoch": 2.3050735778607514, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5465877056121826, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.7044387459754944, + "num_tokens": 543167052.0, + "step": 20990 + }, + { + "epoch": 2.3051833955633647, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.744100332260132, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7119624018669128, + "num_tokens": 543188857.0, + "step": 20991 + }, + { + "epoch": 2.3052932132659785, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5084691047668457, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7187247276306152, + "num_tokens": 543215243.0, + "step": 20992 + }, + { + "epoch": 2.3054030309685922, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.595949172973633, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.711195707321167, + "num_tokens": 543239720.0, + "step": 20993 + }, + { + "epoch": 2.305512848671206, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7164368629455566, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7200592756271362, + "num_tokens": 543261966.0, + "step": 20994 + }, + { + "epoch": 2.3056226663738193, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7652337551116943, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7309106588363647, + "num_tokens": 543281693.0, + "step": 20995 + }, + { + "epoch": 2.305732484076433, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5719640254974365, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7264902591705322, + "num_tokens": 543303138.0, + "step": 20996 + }, + { + "epoch": 2.305842301779047, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 
2.6147453784942627, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6895691156387329, + "num_tokens": 543330442.0, + "step": 20997 + }, + { + "epoch": 2.3059521194816606, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 7.066443920135498, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7000735998153687, + "num_tokens": 543362534.0, + "step": 20998 + }, + { + "epoch": 2.306061937184274, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7847325801849365, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7225682735443115, + "num_tokens": 543382404.0, + "step": 20999 + }, + { + "epoch": 2.3061717548868876, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.574690580368042, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.7040629386901855, + "num_tokens": 543407543.0, + "step": 21000 + }, + { + "epoch": 2.3062815725895014, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5857181549072266, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7278372049331665, + "num_tokens": 543430168.0, + "step": 21001 + }, + { + "epoch": 2.306391390292115, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3366641998291016, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7086489200592041, + "num_tokens": 543458488.0, + "step": 21002 + }, + { + "epoch": 2.306501207994729, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6946187019348145, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7264916896820068, + "num_tokens": 543481233.0, + "step": 21003 + }, + { + "epoch": 2.306611025697342, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6141083240509033, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7139245271682739, + "num_tokens": 543504692.0, + "step": 21004 + }, + { + "epoch": 2.306720843399956, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.586935520172119, + 
"learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7221686840057373, + "num_tokens": 543526621.0, + "step": 21005 + }, + { + "epoch": 2.3068306611025697, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.295684576034546, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7121468782424927, + "num_tokens": 543554414.0, + "step": 21006 + }, + { + "epoch": 2.3069404788051835, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2008893489837646, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6928530931472778, + "num_tokens": 543587486.0, + "step": 21007 + }, + { + "epoch": 2.3070502965077972, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3703761100769043, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6982523798942566, + "num_tokens": 543615678.0, + "step": 21008 + }, + { + "epoch": 2.3071601142104106, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.49548602104187, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7361774444580078, + "num_tokens": 543639927.0, + "step": 21009 + }, + { + "epoch": 2.3072699319130243, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.368091583251953, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6979877948760986, + "num_tokens": 543668013.0, + "step": 21010 + }, + { + "epoch": 2.307379749615638, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5473885536193848, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7325412034988403, + "num_tokens": 543692129.0, + "step": 21011 + }, + { + "epoch": 2.307489567318252, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5625035762786865, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7268958687782288, + "num_tokens": 543714848.0, + "step": 21012 + }, + { + "epoch": 2.3075993850208656, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.550827741622925, + "learning_rate": 1e-06, + 
"loss": 0.9662, + "mean_token_accuracy": 0.7123505473136902, + "num_tokens": 543739756.0, + "step": 21013 + }, + { + "epoch": 2.307709202723479, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6795895099639893, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.710464596748352, + "num_tokens": 543761497.0, + "step": 21014 + }, + { + "epoch": 2.3078190204260927, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7370827198028564, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7172881364822388, + "num_tokens": 543783793.0, + "step": 21015 + }, + { + "epoch": 2.3079288381287064, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5195271968841553, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7055293321609497, + "num_tokens": 543809473.0, + "step": 21016 + }, + { + "epoch": 2.30803865583132, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.564131498336792, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.720588207244873, + "num_tokens": 543832119.0, + "step": 21017 + }, + { + "epoch": 2.3081484735339335, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.449220895767212, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7098220586776733, + "num_tokens": 543858050.0, + "step": 21018 + }, + { + "epoch": 2.3082582912365472, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4096834659576416, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7138530611991882, + "num_tokens": 543883102.0, + "step": 21019 + }, + { + "epoch": 2.308368108939161, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6548216342926025, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7350502610206604, + "num_tokens": 543904191.0, + "step": 21020 + }, + { + "epoch": 2.3084779266417748, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.673398494720459, + "learning_rate": 1e-06, + "loss": 0.9324, + 
"mean_token_accuracy": 0.7192310690879822, + "num_tokens": 543925282.0, + "step": 21021 + }, + { + "epoch": 2.308587744344388, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7671425342559814, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7101295590400696, + "num_tokens": 543948199.0, + "step": 21022 + }, + { + "epoch": 2.308697562047002, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.107016086578369, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7384092807769775, + "num_tokens": 543978822.0, + "step": 21023 + }, + { + "epoch": 2.3088073797496156, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.264472723007202, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6948471665382385, + "num_tokens": 544009634.0, + "step": 21024 + }, + { + "epoch": 2.3089171974522293, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.136117935180664, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.7004985213279724, + "num_tokens": 544045825.0, + "step": 21025 + }, + { + "epoch": 2.309027015154843, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.566751003265381, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7236007452011108, + "num_tokens": 544066655.0, + "step": 21026 + }, + { + "epoch": 2.3091368328574564, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.6362357139587402, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7065964937210083, + "num_tokens": 544088602.0, + "step": 21027 + }, + { + "epoch": 2.30924665056007, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.517010450363159, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7264654636383057, + "num_tokens": 544111969.0, + "step": 21028 + }, + { + "epoch": 2.309356468262684, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5292763710021973, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 
0.7353060245513916, + "num_tokens": 544136419.0, + "step": 21029 + }, + { + "epoch": 2.3094662859652977, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.1833832263946533, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7087072730064392, + "num_tokens": 544167652.0, + "step": 21030 + }, + { + "epoch": 2.3095761036679114, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.298367738723755, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7100160717964172, + "num_tokens": 544196676.0, + "step": 21031 + }, + { + "epoch": 2.3096859213705248, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.462085008621216, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.722129225730896, + "num_tokens": 544222160.0, + "step": 21032 + }, + { + "epoch": 2.3097957390731385, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4476828575134277, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.7001039981842041, + "num_tokens": 544250547.0, + "step": 21033 + }, + { + "epoch": 2.3099055567757523, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5912933349609375, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6997206211090088, + "num_tokens": 544275216.0, + "step": 21034 + }, + { + "epoch": 2.310015374478366, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 3.379549741744995, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7129544615745544, + "num_tokens": 544302957.0, + "step": 21035 + }, + { + "epoch": 2.31012519218098, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.484450340270996, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7039141058921814, + "num_tokens": 544330432.0, + "step": 21036 + }, + { + "epoch": 2.310235009883593, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.410865306854248, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7160836458206177, + 
"num_tokens": 544358067.0, + "step": 21037 + }, + { + "epoch": 2.310344827586207, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7980587482452393, + "learning_rate": 1e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7695021629333496, + "num_tokens": 544378540.0, + "step": 21038 + }, + { + "epoch": 2.3104546452888206, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5408220291137695, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7435017824172974, + "num_tokens": 544402832.0, + "step": 21039 + }, + { + "epoch": 2.3105644629914344, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.5761420726776123, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7400104403495789, + "num_tokens": 544424797.0, + "step": 21040 + }, + { + "epoch": 2.310674280694048, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3741273880004883, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7082288265228271, + "num_tokens": 544457150.0, + "step": 21041 + }, + { + "epoch": 2.3107840983966614, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.18315052986145, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7177174091339111, + "num_tokens": 544488923.0, + "step": 21042 + }, + { + "epoch": 2.310893916099275, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.2503674030303955, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7191391587257385, + "num_tokens": 544518724.0, + "step": 21043 + }, + { + "epoch": 2.311003733801889, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.0761775970458984, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7225364446640015, + "num_tokens": 544551721.0, + "step": 21044 + }, + { + "epoch": 2.3111135515045027, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.581463575363159, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.702517032623291, + "num_tokens": 544575492.0, + 
"step": 21045 + }, + { + "epoch": 2.311223369207116, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3447248935699463, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7084211111068726, + "num_tokens": 544602952.0, + "step": 21046 + }, + { + "epoch": 2.3113331869097298, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4185478687286377, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7310368418693542, + "num_tokens": 544628679.0, + "step": 21047 + }, + { + "epoch": 2.3114430046123435, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4603447914123535, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7181355357170105, + "num_tokens": 544655406.0, + "step": 21048 + }, + { + "epoch": 2.3115528223149573, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.7157399654388428, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.7036459445953369, + "num_tokens": 544676038.0, + "step": 21049 + }, + { + "epoch": 2.3116626400175706, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.786036968231201, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7369968891143799, + "num_tokens": 544696723.0, + "step": 21050 + }, + { + "epoch": 2.3117724577201844, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.442986249923706, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7241789102554321, + "num_tokens": 544720954.0, + "step": 21051 + }, + { + "epoch": 2.311882275422798, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4042372703552246, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.713382363319397, + "num_tokens": 544748396.0, + "step": 21052 + }, + { + "epoch": 2.311992093125412, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5150699615478516, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7250207662582397, + "num_tokens": 544771867.0, + "step": 21053 + }, + { + 
"epoch": 2.3121019108280256, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5090925693511963, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7316566705703735, + "num_tokens": 544794970.0, + "step": 21054 + }, + { + "epoch": 2.312211728530639, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.440593719482422, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7294542789459229, + "num_tokens": 544821841.0, + "step": 21055 + }, + { + "epoch": 2.3123215462332527, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3659989833831787, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7097379565238953, + "num_tokens": 544849110.0, + "step": 21056 + }, + { + "epoch": 2.3124313639358665, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.3965961933135986, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7111977934837341, + "num_tokens": 544876829.0, + "step": 21057 + }, + { + "epoch": 2.31254118163848, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.324397563934326, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7322203516960144, + "num_tokens": 544904983.0, + "step": 21058 + }, + { + "epoch": 2.312650999341094, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.748889207839966, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7011928558349609, + "num_tokens": 544926799.0, + "step": 21059 + }, + { + "epoch": 2.3127608170437073, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4162588119506836, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6910440921783447, + "num_tokens": 544956576.0, + "step": 21060 + }, + { + "epoch": 2.312870634746321, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6635069847106934, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7343297600746155, + "num_tokens": 544977466.0, + "step": 21061 + }, + { + "epoch": 2.312980452448935, + 
"ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.4143362045288086, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6972442269325256, + "num_tokens": 545006051.0, + "step": 21062 + }, + { + "epoch": 2.3130902701515486, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.427516460418701, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7016816139221191, + "num_tokens": 545032919.0, + "step": 21063 + }, + { + "epoch": 2.3132000878541623, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.395111560821533, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7117636203765869, + "num_tokens": 545058743.0, + "step": 21064 + }, + { + "epoch": 2.3133099055567756, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6239564418792725, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7312777042388916, + "num_tokens": 545080941.0, + "step": 21065 + }, + { + "epoch": 2.3134197232593894, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4290597438812256, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.7043829560279846, + "num_tokens": 545111743.0, + "step": 21066 + }, + { + "epoch": 2.313529540962003, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5478343963623047, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.7020613551139832, + "num_tokens": 545137372.0, + "step": 21067 + }, + { + "epoch": 2.313639358664617, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7664735317230225, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7079560160636902, + "num_tokens": 545161184.0, + "step": 21068 + }, + { + "epoch": 2.31374917636723, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5661067962646484, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.748967170715332, + "num_tokens": 545184304.0, + "step": 21069 + }, + { + "epoch": 2.313858994069844, + "ewc_loss": 
2.1576881408691406e-05, + "grad_norm": 2.4152135848999023, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7071006894111633, + "num_tokens": 545210656.0, + "step": 21070 + }, + { + "epoch": 2.3139688117724577, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.577099084854126, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7396806478500366, + "num_tokens": 545235013.0, + "step": 21071 + }, + { + "epoch": 2.3140786294750715, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 2.378342866897583, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.710608720779419, + "num_tokens": 545261614.0, + "step": 21072 + }, + { + "epoch": 2.314188447177685, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.331054449081421, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7199592590332031, + "num_tokens": 545288530.0, + "step": 21073 + }, + { + "epoch": 2.3142982648802986, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3510959148406982, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7013862729072571, + "num_tokens": 545316653.0, + "step": 21074 + }, + { + "epoch": 2.3144080825829123, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.401870012283325, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7166447043418884, + "num_tokens": 545344921.0, + "step": 21075 + }, + { + "epoch": 2.314517900285526, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2345049381256104, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6929337382316589, + "num_tokens": 545375523.0, + "step": 21076 + }, + { + "epoch": 2.31462771798814, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.129676580429077, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7150560617446899, + "num_tokens": 545407587.0, + "step": 21077 + }, + { + "epoch": 2.314737535690753, + "ewc_loss": 2.1576881408691406e-05, + 
"grad_norm": 2.505141019821167, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7265540361404419, + "num_tokens": 545432350.0, + "step": 21078 + }, + { + "epoch": 2.314847353393367, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.378775119781494, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7315439581871033, + "num_tokens": 545460827.0, + "step": 21079 + }, + { + "epoch": 2.3149571710959806, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.532072067260742, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7196134328842163, + "num_tokens": 545484607.0, + "step": 21080 + }, + { + "epoch": 2.3150669887985944, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.473440647125244, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7102786302566528, + "num_tokens": 545508854.0, + "step": 21081 + }, + { + "epoch": 2.315176806501208, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4019381999969482, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6892507076263428, + "num_tokens": 545536439.0, + "step": 21082 + }, + { + "epoch": 2.3152866242038215, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3758487701416016, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7109417915344238, + "num_tokens": 545562043.0, + "step": 21083 + }, + { + "epoch": 2.3153964419064352, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.377131462097168, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7147737741470337, + "num_tokens": 545589760.0, + "step": 21084 + }, + { + "epoch": 2.315506259609049, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4456441402435303, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7278085350990295, + "num_tokens": 545614440.0, + "step": 21085 + }, + { + "epoch": 2.3156160773116627, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3506438732147217, 
+ "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7203695178031921, + "num_tokens": 545639962.0, + "step": 21086 + }, + { + "epoch": 2.3157258950142765, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.740870237350464, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6993454098701477, + "num_tokens": 545663324.0, + "step": 21087 + }, + { + "epoch": 2.31583571271689, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6159827709198, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7359115481376648, + "num_tokens": 545685668.0, + "step": 21088 + }, + { + "epoch": 2.3159455304195036, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.601783275604248, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7131918668746948, + "num_tokens": 545709631.0, + "step": 21089 + }, + { + "epoch": 2.3160553481221173, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5817039012908936, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.731979250907898, + "num_tokens": 545733222.0, + "step": 21090 + }, + { + "epoch": 2.316165165824731, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.464278221130371, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7035976648330688, + "num_tokens": 545759942.0, + "step": 21091 + }, + { + "epoch": 2.316274983527345, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.327927827835083, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7294192314147949, + "num_tokens": 545785642.0, + "step": 21092 + }, + { + "epoch": 2.316384801229958, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2965896129608154, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7237209677696228, + "num_tokens": 545814865.0, + "step": 21093 + }, + { + "epoch": 2.316494618932572, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.519716262817383, + "learning_rate": 1e-06, + "loss": 
0.9896, + "mean_token_accuracy": 0.7079448699951172, + "num_tokens": 545838748.0, + "step": 21094 + }, + { + "epoch": 2.3166044366351857, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.043466567993164, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.7021876573562622, + "num_tokens": 545876189.0, + "step": 21095 + }, + { + "epoch": 2.3167142543377994, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2240917682647705, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6996884346008301, + "num_tokens": 545906308.0, + "step": 21096 + }, + { + "epoch": 2.3168240720404127, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.417067050933838, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7199490070343018, + "num_tokens": 545931183.0, + "step": 21097 + }, + { + "epoch": 2.3169338897430265, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.275141716003418, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7352725863456726, + "num_tokens": 545958608.0, + "step": 21098 + }, + { + "epoch": 2.3170437074456403, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.242198944091797, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7088782787322998, + "num_tokens": 545988236.0, + "step": 21099 + }, + { + "epoch": 2.317153525148254, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.279057264328003, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.711754322052002, + "num_tokens": 546016863.0, + "step": 21100 + }, + { + "epoch": 2.3172633428508673, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.608614683151245, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7382649183273315, + "num_tokens": 546038858.0, + "step": 21101 + }, + { + "epoch": 2.317373160553481, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5844473838806152, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 
0.7298848032951355, + "num_tokens": 546064374.0, + "step": 21102 + }, + { + "epoch": 2.317482978256095, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.321854829788208, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7269275188446045, + "num_tokens": 546091560.0, + "step": 21103 + }, + { + "epoch": 2.3175927959587086, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2264628410339355, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7202646136283875, + "num_tokens": 546121413.0, + "step": 21104 + }, + { + "epoch": 2.3177026136613224, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.486478805541992, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6915807723999023, + "num_tokens": 546146179.0, + "step": 21105 + }, + { + "epoch": 2.3178124313639357, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4659829139709473, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7342262268066406, + "num_tokens": 546169669.0, + "step": 21106 + }, + { + "epoch": 2.3179222490665494, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7720069885253906, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6922764778137207, + "num_tokens": 546196400.0, + "step": 21107 + }, + { + "epoch": 2.318032066769163, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.594637870788574, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.718339204788208, + "num_tokens": 546219794.0, + "step": 21108 + }, + { + "epoch": 2.318141884471777, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1008503437042236, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6993653774261475, + "num_tokens": 546252721.0, + "step": 21109 + }, + { + "epoch": 2.3182517021743907, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5456717014312744, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7450658679008484, + 
"num_tokens": 546275903.0, + "step": 21110 + }, + { + "epoch": 2.318361519877004, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.9190077781677246, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7244898080825806, + "num_tokens": 546296258.0, + "step": 21111 + }, + { + "epoch": 2.3184713375796178, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6916465759277344, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.709929883480072, + "num_tokens": 546320414.0, + "step": 21112 + }, + { + "epoch": 2.3185811552822315, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6198618412017822, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7384291291236877, + "num_tokens": 546341684.0, + "step": 21113 + }, + { + "epoch": 2.3186909729848453, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5679569244384766, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7460487484931946, + "num_tokens": 546362874.0, + "step": 21114 + }, + { + "epoch": 2.318800790687459, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4192662239074707, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7026983499526978, + "num_tokens": 546390052.0, + "step": 21115 + }, + { + "epoch": 2.3189106083900723, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3112080097198486, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7074892520904541, + "num_tokens": 546419092.0, + "step": 21116 + }, + { + "epoch": 2.319020426092686, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.640601873397827, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7162657976150513, + "num_tokens": 546441330.0, + "step": 21117 + }, + { + "epoch": 2.3191302437953, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7168848514556885, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7168206572532654, + "num_tokens": 546462439.0, + 
"step": 21118 + }, + { + "epoch": 2.3192400614979136, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4112563133239746, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7446384429931641, + "num_tokens": 546487977.0, + "step": 21119 + }, + { + "epoch": 2.319349879200527, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 3.1833455562591553, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7336076498031616, + "num_tokens": 546505219.0, + "step": 21120 + }, + { + "epoch": 2.3194596969031407, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.332348108291626, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7158096432685852, + "num_tokens": 546533972.0, + "step": 21121 + }, + { + "epoch": 2.3195695146057544, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.501943588256836, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7029138803482056, + "num_tokens": 546559537.0, + "step": 21122 + }, + { + "epoch": 2.319679332308368, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5376169681549072, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6976292133331299, + "num_tokens": 546583319.0, + "step": 21123 + }, + { + "epoch": 2.319789150010982, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.364690065383911, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.716274082660675, + "num_tokens": 546609774.0, + "step": 21124 + }, + { + "epoch": 2.3198989677135953, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.368056058883667, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7159344553947449, + "num_tokens": 546637470.0, + "step": 21125 + }, + { + "epoch": 2.320008785416209, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3395700454711914, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7308595776557922, + "num_tokens": 546663033.0, + "step": 21126 + }, + { + "epoch": 
2.320118603118823, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6576597690582275, + "learning_rate": 1e-06, + "loss": 1.0957, + "mean_token_accuracy": 0.6769400835037231, + "num_tokens": 546686081.0, + "step": 21127 + }, + { + "epoch": 2.3202284208214365, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5117027759552, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7282081842422485, + "num_tokens": 546711171.0, + "step": 21128 + }, + { + "epoch": 2.32033823852405, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.746555805206299, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6994388699531555, + "num_tokens": 546733502.0, + "step": 21129 + }, + { + "epoch": 2.3204480562266636, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3901729583740234, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7029073238372803, + "num_tokens": 546762358.0, + "step": 21130 + }, + { + "epoch": 2.3205578739292774, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4614057540893555, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7133439183235168, + "num_tokens": 546787678.0, + "step": 21131 + }, + { + "epoch": 2.320667691631891, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2513229846954346, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7071508765220642, + "num_tokens": 546818176.0, + "step": 21132 + }, + { + "epoch": 2.320777509334505, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5346057415008545, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7170372605323792, + "num_tokens": 546842222.0, + "step": 21133 + }, + { + "epoch": 2.320887327037118, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.513458251953125, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7135301828384399, + "num_tokens": 546866707.0, + "step": 21134 + }, + { + "epoch": 2.320997144739732, + "ewc_loss": 
2.1576881408691406e-05, + "grad_norm": 2.5115199089050293, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7258719801902771, + "num_tokens": 546891770.0, + "step": 21135 + }, + { + "epoch": 2.3211069624423457, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6450753211975098, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7099560499191284, + "num_tokens": 546915119.0, + "step": 21136 + }, + { + "epoch": 2.3212167801449595, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2687923908233643, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6981899738311768, + "num_tokens": 546946013.0, + "step": 21137 + }, + { + "epoch": 2.3213265978475732, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.503586769104004, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.730732798576355, + "num_tokens": 546971462.0, + "step": 21138 + }, + { + "epoch": 2.3214364155501865, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.347537040710449, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7110109925270081, + "num_tokens": 547000981.0, + "step": 21139 + }, + { + "epoch": 2.3215462332528003, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.544214963912964, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7316961288452148, + "num_tokens": 547025222.0, + "step": 21140 + }, + { + "epoch": 2.321656050955414, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6117899417877197, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7131334543228149, + "num_tokens": 547049261.0, + "step": 21141 + }, + { + "epoch": 2.321765868658028, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6454594135284424, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7309167981147766, + "num_tokens": 547071238.0, + "step": 21142 + }, + { + "epoch": 2.3218756863606416, + "ewc_loss": 2.1576881408691406e-05, + 
"grad_norm": 2.2758939266204834, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.702462375164032, + "num_tokens": 547102348.0, + "step": 21143 + }, + { + "epoch": 2.321985504063255, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.353257894515991, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7088198661804199, + "num_tokens": 547130826.0, + "step": 21144 + }, + { + "epoch": 2.3220953217658686, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7645950317382812, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.72447669506073, + "num_tokens": 547152390.0, + "step": 21145 + }, + { + "epoch": 2.3222051394684824, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.637728214263916, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7108786702156067, + "num_tokens": 547176962.0, + "step": 21146 + }, + { + "epoch": 2.322314957171096, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4719295501708984, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7121880054473877, + "num_tokens": 547201818.0, + "step": 21147 + }, + { + "epoch": 2.3224247748737095, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4827096462249756, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7232736945152283, + "num_tokens": 547227966.0, + "step": 21148 + }, + { + "epoch": 2.322534592576323, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3608558177948, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7301523685455322, + "num_tokens": 547254343.0, + "step": 21149 + }, + { + "epoch": 2.322644410278937, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.485873222351074, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7218175530433655, + "num_tokens": 547281730.0, + "step": 21150 + }, + { + "epoch": 2.3227542279815507, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2845776081085205, + 
"learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7089536190032959, + "num_tokens": 547310001.0, + "step": 21151 + }, + { + "epoch": 2.322864045684164, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7545840740203857, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.732100248336792, + "num_tokens": 547331369.0, + "step": 21152 + }, + { + "epoch": 2.322973863386778, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4825382232666016, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7148376107215881, + "num_tokens": 547357263.0, + "step": 21153 + }, + { + "epoch": 2.3230836810893916, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3132102489471436, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6945701837539673, + "num_tokens": 547386568.0, + "step": 21154 + }, + { + "epoch": 2.3231934987920053, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.068932056427002, + "learning_rate": 1e-06, + "loss": 1.1243, + "mean_token_accuracy": 0.6726060509681702, + "num_tokens": 547422711.0, + "step": 21155 + }, + { + "epoch": 2.323303316494619, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3503246307373047, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6970890760421753, + "num_tokens": 547448999.0, + "step": 21156 + }, + { + "epoch": 2.3234131341972324, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5077614784240723, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7177482843399048, + "num_tokens": 547473272.0, + "step": 21157 + }, + { + "epoch": 2.323522951899846, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8366174697875977, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7216705083847046, + "num_tokens": 547492875.0, + "step": 21158 + }, + { + "epoch": 2.32363276960246, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1686289310455322, + "learning_rate": 1e-06, + 
"loss": 1.0295, + "mean_token_accuracy": 0.7011173367500305, + "num_tokens": 547526202.0, + "step": 21159 + }, + { + "epoch": 2.3237425873050737, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.506650686264038, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7139872312545776, + "num_tokens": 547551740.0, + "step": 21160 + }, + { + "epoch": 2.3238524050076874, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.600126266479492, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.7036777138710022, + "num_tokens": 547576042.0, + "step": 21161 + }, + { + "epoch": 2.3239622227103007, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5048515796661377, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7022964954376221, + "num_tokens": 547605542.0, + "step": 21162 + }, + { + "epoch": 2.3240720404129145, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1544747352600098, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6882305145263672, + "num_tokens": 547636849.0, + "step": 21163 + }, + { + "epoch": 2.3241818581155282, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.558248281478882, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7014822959899902, + "num_tokens": 547661953.0, + "step": 21164 + }, + { + "epoch": 2.324291675818142, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.538025379180908, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7279210686683655, + "num_tokens": 547685654.0, + "step": 21165 + }, + { + "epoch": 2.3244014935207558, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3454794883728027, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.7003247737884521, + "num_tokens": 547712398.0, + "step": 21166 + }, + { + "epoch": 2.324511311223369, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.398970603942871, + "learning_rate": 1e-06, + "loss": 0.951, + 
"mean_token_accuracy": 0.7176198363304138, + "num_tokens": 547737811.0, + "step": 21167 + }, + { + "epoch": 2.324621128925983, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.550513505935669, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7142918109893799, + "num_tokens": 547762265.0, + "step": 21168 + }, + { + "epoch": 2.3247309466285966, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6961357593536377, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7244547605514526, + "num_tokens": 547783120.0, + "step": 21169 + }, + { + "epoch": 2.3248407643312103, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1964187622070312, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6970615386962891, + "num_tokens": 547814700.0, + "step": 21170 + }, + { + "epoch": 2.324950582033824, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.362656593322754, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6962192058563232, + "num_tokens": 547842383.0, + "step": 21171 + }, + { + "epoch": 2.3250603997364374, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.500462532043457, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.7064062356948853, + "num_tokens": 547868389.0, + "step": 21172 + }, + { + "epoch": 2.325170217439051, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.224776268005371, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7042747735977173, + "num_tokens": 547898639.0, + "step": 21173 + }, + { + "epoch": 2.325280035141665, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3630216121673584, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7193621397018433, + "num_tokens": 547925825.0, + "step": 21174 + }, + { + "epoch": 2.3253898528442787, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5947041511535645, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 
0.7168618440628052, + "num_tokens": 547949645.0, + "step": 21175 + }, + { + "epoch": 2.325499670546892, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6295557022094727, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.702338695526123, + "num_tokens": 547974075.0, + "step": 21176 + }, + { + "epoch": 2.3256094882495058, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4600272178649902, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7151212692260742, + "num_tokens": 548000066.0, + "step": 21177 + }, + { + "epoch": 2.3257193059521195, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.293626070022583, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6968504190444946, + "num_tokens": 548028726.0, + "step": 21178 + }, + { + "epoch": 2.3258291236547333, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5445992946624756, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7428398132324219, + "num_tokens": 548050967.0, + "step": 21179 + }, + { + "epoch": 2.3259389413573466, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.496901273727417, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7142949104309082, + "num_tokens": 548075738.0, + "step": 21180 + }, + { + "epoch": 2.3260487590599603, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.698057174682617, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7495792508125305, + "num_tokens": 548097259.0, + "step": 21181 + }, + { + "epoch": 2.326158576762574, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6097500324249268, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6997454166412354, + "num_tokens": 548128313.0, + "step": 21182 + }, + { + "epoch": 2.326268394465188, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.471450090408325, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7093816995620728, + 
"num_tokens": 548155143.0, + "step": 21183 + }, + { + "epoch": 2.3263782121678016, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7256667613983154, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7205920815467834, + "num_tokens": 548176848.0, + "step": 21184 + }, + { + "epoch": 2.326488029870415, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8852875232696533, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7312653064727783, + "num_tokens": 548196503.0, + "step": 21185 + }, + { + "epoch": 2.3265978475730287, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8972041606903076, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7264103889465332, + "num_tokens": 548214433.0, + "step": 21186 + }, + { + "epoch": 2.3267076652756424, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.472614288330078, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.7001121640205383, + "num_tokens": 548242208.0, + "step": 21187 + }, + { + "epoch": 2.326817482978256, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4278275966644287, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7153571844100952, + "num_tokens": 548267871.0, + "step": 21188 + }, + { + "epoch": 2.32692730068087, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.9386308193206787, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7215867638587952, + "num_tokens": 548285796.0, + "step": 21189 + }, + { + "epoch": 2.3270371183834833, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.411562442779541, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7112725973129272, + "num_tokens": 548314029.0, + "step": 21190 + }, + { + "epoch": 2.327146936086097, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.616666078567505, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7416621446609497, + "num_tokens": 548339195.0, + 
"step": 21191 + }, + { + "epoch": 2.3272567537887108, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4760167598724365, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7100273370742798, + "num_tokens": 548363915.0, + "step": 21192 + }, + { + "epoch": 2.3273665714913245, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3216588497161865, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7093778252601624, + "num_tokens": 548392975.0, + "step": 21193 + }, + { + "epoch": 2.3274763891939383, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.200138807296753, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.689873218536377, + "num_tokens": 548423278.0, + "step": 21194 + }, + { + "epoch": 2.3275862068965516, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3748605251312256, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7166855931282043, + "num_tokens": 548451510.0, + "step": 21195 + }, + { + "epoch": 2.3276960245991654, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4216957092285156, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.723655641078949, + "num_tokens": 548477218.0, + "step": 21196 + }, + { + "epoch": 2.327805842301779, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5976078510284424, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7331873178482056, + "num_tokens": 548503026.0, + "step": 21197 + }, + { + "epoch": 2.327915660004393, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.960960626602173, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7216417789459229, + "num_tokens": 548525537.0, + "step": 21198 + }, + { + "epoch": 2.328025477707006, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.392404317855835, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7043092846870422, + "num_tokens": 548554097.0, + "step": 21199 + }, + { + 
"epoch": 2.32813529540962, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5459229946136475, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7180955410003662, + "num_tokens": 548578518.0, + "step": 21200 + }, + { + "epoch": 2.3282451131122337, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.491220235824585, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.716261088848114, + "num_tokens": 548603344.0, + "step": 21201 + }, + { + "epoch": 2.3283549308148475, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3481345176696777, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7319560647010803, + "num_tokens": 548627826.0, + "step": 21202 + }, + { + "epoch": 2.3284647485174608, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.50050687789917, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7258869409561157, + "num_tokens": 548652117.0, + "step": 21203 + }, + { + "epoch": 2.3285745662200745, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5562448501586914, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7315646409988403, + "num_tokens": 548677237.0, + "step": 21204 + }, + { + "epoch": 2.3286843839226883, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1898884773254395, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7120739221572876, + "num_tokens": 548709854.0, + "step": 21205 + }, + { + "epoch": 2.328794201625302, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3492019176483154, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7041298151016235, + "num_tokens": 548736894.0, + "step": 21206 + }, + { + "epoch": 2.328904019327916, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5774686336517334, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7254705429077148, + "num_tokens": 548758761.0, + "step": 21207 + }, + { + "epoch": 2.329013837030529, + 
"ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.545327663421631, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7080826759338379, + "num_tokens": 548783443.0, + "step": 21208 + }, + { + "epoch": 2.329123654733143, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4873411655426025, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6999979019165039, + "num_tokens": 548809750.0, + "step": 21209 + }, + { + "epoch": 2.3292334724357566, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.277675151824951, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7109739780426025, + "num_tokens": 548839854.0, + "step": 21210 + }, + { + "epoch": 2.3293432901383704, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.447155714035034, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.740845263004303, + "num_tokens": 548864134.0, + "step": 21211 + }, + { + "epoch": 2.329453107840984, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.387188196182251, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6905066967010498, + "num_tokens": 548893718.0, + "step": 21212 + }, + { + "epoch": 2.3295629255435975, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.452671766281128, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.7012336850166321, + "num_tokens": 548920479.0, + "step": 21213 + }, + { + "epoch": 2.329672743246211, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6550910472869873, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7078601121902466, + "num_tokens": 548944177.0, + "step": 21214 + }, + { + "epoch": 2.329782560948825, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6488282680511475, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6979420185089111, + "num_tokens": 548969981.0, + "step": 21215 + }, + { + "epoch": 2.3298923786514387, + "ewc_loss": 
2.1576881408691406e-05, + "grad_norm": 2.7522878646850586, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6998571157455444, + "num_tokens": 548991299.0, + "step": 21216 + }, + { + "epoch": 2.3300021963540525, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.585785150527954, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7187315225601196, + "num_tokens": 549013991.0, + "step": 21217 + }, + { + "epoch": 2.330112014056666, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4430837631225586, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.745468258857727, + "num_tokens": 549038032.0, + "step": 21218 + }, + { + "epoch": 2.3302218317592795, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5757133960723877, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6896113753318787, + "num_tokens": 549064922.0, + "step": 21219 + }, + { + "epoch": 2.3303316494618933, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.151454448699951, + "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6852682828903198, + "num_tokens": 549100841.0, + "step": 21220 + }, + { + "epoch": 2.330441467164507, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.682081460952759, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7011964321136475, + "num_tokens": 549125312.0, + "step": 21221 + }, + { + "epoch": 2.330551284867121, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8347725868225098, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7226676940917969, + "num_tokens": 549144977.0, + "step": 21222 + }, + { + "epoch": 2.330661102569734, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.15617299079895, + "learning_rate": 1e-06, + "loss": 1.0974, + "mean_token_accuracy": 0.6864932775497437, + "num_tokens": 549178168.0, + "step": 21223 + }, + { + "epoch": 2.330770920272348, + "ewc_loss": 2.1576881408691406e-05, + 
"grad_norm": 2.46317720413208, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7197813391685486, + "num_tokens": 549203052.0, + "step": 21224 + }, + { + "epoch": 2.3308807379749616, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.603972911834717, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7403935194015503, + "num_tokens": 549226087.0, + "step": 21225 + }, + { + "epoch": 2.3309905556775754, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.500673770904541, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7389448881149292, + "num_tokens": 549251485.0, + "step": 21226 + }, + { + "epoch": 2.3311003733801887, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6015689373016357, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.7023530602455139, + "num_tokens": 549280757.0, + "step": 21227 + }, + { + "epoch": 2.3312101910828025, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 3.810305595397949, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7122449278831482, + "num_tokens": 549311095.0, + "step": 21228 + }, + { + "epoch": 2.3313200087854162, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4827053546905518, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7195788621902466, + "num_tokens": 549336923.0, + "step": 21229 + }, + { + "epoch": 2.33142982648803, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2865233421325684, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6996042728424072, + "num_tokens": 549367414.0, + "step": 21230 + }, + { + "epoch": 2.3315396441906433, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.68599009513855, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7219340801239014, + "num_tokens": 549390980.0, + "step": 21231 + }, + { + "epoch": 2.331649461893257, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6290600299835205, 
+ "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7210750579833984, + "num_tokens": 549415582.0, + "step": 21232 + }, + { + "epoch": 2.331759279595871, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3808281421661377, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6922826766967773, + "num_tokens": 549442023.0, + "step": 21233 + }, + { + "epoch": 2.3318690972984846, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3165907859802246, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7280908226966858, + "num_tokens": 549468923.0, + "step": 21234 + }, + { + "epoch": 2.3319789150010983, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.586301803588867, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6978722214698792, + "num_tokens": 549493366.0, + "step": 21235 + }, + { + "epoch": 2.3320887327037116, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2608582973480225, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7094591856002808, + "num_tokens": 549521336.0, + "step": 21236 + }, + { + "epoch": 2.3321985504063254, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.51131272315979, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.7000465393066406, + "num_tokens": 549547172.0, + "step": 21237 + }, + { + "epoch": 2.332308368108939, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8611459732055664, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7356995344161987, + "num_tokens": 549566702.0, + "step": 21238 + }, + { + "epoch": 2.332418185811553, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 3.870969772338867, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7332088351249695, + "num_tokens": 549591628.0, + "step": 21239 + }, + { + "epoch": 2.3325280035141667, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3533504009246826, + "learning_rate": 1e-06, + 
"loss": 1.0237, + "mean_token_accuracy": 0.7026960253715515, + "num_tokens": 549618880.0, + "step": 21240 + }, + { + "epoch": 2.33263782121678, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.056255340576172, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7121121287345886, + "num_tokens": 549652810.0, + "step": 21241 + }, + { + "epoch": 2.3327476389193937, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.842907428741455, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7521733045578003, + "num_tokens": 549671444.0, + "step": 21242 + }, + { + "epoch": 2.3328574566220075, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.732630491256714, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7373404502868652, + "num_tokens": 549691769.0, + "step": 21243 + }, + { + "epoch": 2.3329672743246213, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.647239923477173, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7150631546974182, + "num_tokens": 549715034.0, + "step": 21244 + }, + { + "epoch": 2.333077092027235, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3897573947906494, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7206012010574341, + "num_tokens": 549740360.0, + "step": 21245 + }, + { + "epoch": 2.3331869097298483, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4166133403778076, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7410069108009338, + "num_tokens": 549766285.0, + "step": 21246 + }, + { + "epoch": 2.333296727432462, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.267713785171509, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.694284200668335, + "num_tokens": 549796952.0, + "step": 21247 + }, + { + "epoch": 2.333406545135076, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.817412853240967, + "learning_rate": 1e-06, + "loss": 0.8832, + 
"mean_token_accuracy": 0.734238862991333, + "num_tokens": 549816546.0, + "step": 21248 + }, + { + "epoch": 2.3335163628376896, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.43212890625, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.719149649143219, + "num_tokens": 549840980.0, + "step": 21249 + }, + { + "epoch": 2.333626180540303, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6805901527404785, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6940392255783081, + "num_tokens": 549865581.0, + "step": 21250 + }, + { + "epoch": 2.3337359982429167, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.478438377380371, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.712907075881958, + "num_tokens": 549893505.0, + "step": 21251 + }, + { + "epoch": 2.3338458159455304, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 32.6196174621582, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.720740556716919, + "num_tokens": 549913166.0, + "step": 21252 + }, + { + "epoch": 2.333955633648144, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6556954383850098, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7155418395996094, + "num_tokens": 549936725.0, + "step": 21253 + }, + { + "epoch": 2.334065451350758, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.309089422225952, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7203404307365417, + "num_tokens": 549965484.0, + "step": 21254 + }, + { + "epoch": 2.3341752690533712, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4162540435791016, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7178834676742554, + "num_tokens": 549990882.0, + "step": 21255 + }, + { + "epoch": 2.334285086755985, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3340556621551514, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6919752955436707, 
+ "num_tokens": 550019734.0, + "step": 21256 + }, + { + "epoch": 2.3343949044585988, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6735293865203857, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7301709651947021, + "num_tokens": 550040980.0, + "step": 21257 + }, + { + "epoch": 2.3345047221612125, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.462977409362793, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7031444907188416, + "num_tokens": 550065783.0, + "step": 21258 + }, + { + "epoch": 2.334614539863826, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2345266342163086, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7354044914245605, + "num_tokens": 550094010.0, + "step": 21259 + }, + { + "epoch": 2.3347243575664396, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7015926837921143, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7092021703720093, + "num_tokens": 550114344.0, + "step": 21260 + }, + { + "epoch": 2.3348341752690533, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.536590814590454, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7092909216880798, + "num_tokens": 550142710.0, + "step": 21261 + }, + { + "epoch": 2.334943992971667, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.404254674911499, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7218517065048218, + "num_tokens": 550168823.0, + "step": 21262 + }, + { + "epoch": 2.335053810674281, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4473860263824463, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7170693874359131, + "num_tokens": 550193547.0, + "step": 21263 + }, + { + "epoch": 2.335163628376894, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.569471597671509, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7168635129928589, + "num_tokens": 550219685.0, + 
"step": 21264 + }, + { + "epoch": 2.335273446079508, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3151354789733887, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6992018222808838, + "num_tokens": 550248353.0, + "step": 21265 + }, + { + "epoch": 2.3353832637821217, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3034119606018066, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.727257490158081, + "num_tokens": 550278798.0, + "step": 21266 + }, + { + "epoch": 2.3354930814847354, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4555954933166504, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7008233666419983, + "num_tokens": 550306111.0, + "step": 21267 + }, + { + "epoch": 2.335602899187349, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.347897529602051, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7282254099845886, + "num_tokens": 550333461.0, + "step": 21268 + }, + { + "epoch": 2.3357127168899625, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6145992279052734, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7633951902389526, + "num_tokens": 550354430.0, + "step": 21269 + }, + { + "epoch": 2.3358225345925763, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5615570545196533, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7129241228103638, + "num_tokens": 550379375.0, + "step": 21270 + }, + { + "epoch": 2.33593235229519, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3108298778533936, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7040315270423889, + "num_tokens": 550408939.0, + "step": 21271 + }, + { + "epoch": 2.336042169997804, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8165066242218018, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7196496725082397, + "num_tokens": 550430170.0, + "step": 21272 + }, + { + 
"epoch": 2.3361519877004175, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5240230560302734, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7366757392883301, + "num_tokens": 550453836.0, + "step": 21273 + }, + { + "epoch": 2.336261805403031, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.137497663497925, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7045654058456421, + "num_tokens": 550485340.0, + "step": 21274 + }, + { + "epoch": 2.3363716231056446, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.107553720474243, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6896060109138489, + "num_tokens": 550520121.0, + "step": 21275 + }, + { + "epoch": 2.3364814408082584, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.555147886276245, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7225594520568848, + "num_tokens": 550545685.0, + "step": 21276 + }, + { + "epoch": 2.336591258510872, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3020131587982178, + "learning_rate": 1e-06, + "loss": 1.0982, + "mean_token_accuracy": 0.6801249980926514, + "num_tokens": 550576946.0, + "step": 21277 + }, + { + "epoch": 2.3367010762134854, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7344865798950195, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.733518660068512, + "num_tokens": 550598771.0, + "step": 21278 + }, + { + "epoch": 2.336810893916099, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4132163524627686, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.705985426902771, + "num_tokens": 550625923.0, + "step": 21279 + }, + { + "epoch": 2.336920711618713, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.263456344604492, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7072685360908508, + "num_tokens": 550654256.0, + "step": 21280 + }, + { + "epoch": 2.3370305293213267, + 
"ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.234496831893921, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.70258629322052, + "num_tokens": 550684929.0, + "step": 21281 + }, + { + "epoch": 2.33714034702394, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3449769020080566, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7054453492164612, + "num_tokens": 550712460.0, + "step": 21282 + }, + { + "epoch": 2.337250164726554, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3848841190338135, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7157219052314758, + "num_tokens": 550740829.0, + "step": 21283 + }, + { + "epoch": 2.3373599824291675, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.391840696334839, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7159351706504822, + "num_tokens": 550767310.0, + "step": 21284 + }, + { + "epoch": 2.3374698001317813, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3396377563476562, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7287862300872803, + "num_tokens": 550793297.0, + "step": 21285 + }, + { + "epoch": 2.337579617834395, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.62345552444458, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7203655242919922, + "num_tokens": 550814235.0, + "step": 21286 + }, + { + "epoch": 2.3376894355370084, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4493613243103027, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7394832372665405, + "num_tokens": 550838545.0, + "step": 21287 + }, + { + "epoch": 2.337799253239622, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6688318252563477, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.71198570728302, + "num_tokens": 550861019.0, + "step": 21288 + }, + { + "epoch": 2.337909070942236, + "ewc_loss": 2.1576881408691406e-05, + 
"grad_norm": 2.610511064529419, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.714993953704834, + "num_tokens": 550884329.0, + "step": 21289 + }, + { + "epoch": 2.3380188886448496, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.214472770690918, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.7035502791404724, + "num_tokens": 550914916.0, + "step": 21290 + }, + { + "epoch": 2.3381287063474634, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.712740659713745, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7257447838783264, + "num_tokens": 550937653.0, + "step": 21291 + }, + { + "epoch": 2.3382385240500767, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.398416519165039, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7030546069145203, + "num_tokens": 550964748.0, + "step": 21292 + }, + { + "epoch": 2.3383483417526905, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5151572227478027, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7192189693450928, + "num_tokens": 550988746.0, + "step": 21293 + }, + { + "epoch": 2.338458159455304, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.442735195159912, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7293223142623901, + "num_tokens": 551012690.0, + "step": 21294 + }, + { + "epoch": 2.338567977157918, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1163265705108643, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7215997576713562, + "num_tokens": 551046199.0, + "step": 21295 + }, + { + "epoch": 2.3386777948605317, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4053778648376465, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.7065473794937134, + "num_tokens": 551072392.0, + "step": 21296 + }, + { + "epoch": 2.338787612563145, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4199419021606445, + 
"learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7006151676177979, + "num_tokens": 551096181.0, + "step": 21297 + }, + { + "epoch": 2.338897430265759, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.334860324859619, + "learning_rate": 1e-06, + "loss": 1.0925, + "mean_token_accuracy": 0.6834969520568848, + "num_tokens": 551126784.0, + "step": 21298 + }, + { + "epoch": 2.3390072479683726, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.214419364929199, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7016627788543701, + "num_tokens": 551157975.0, + "step": 21299 + }, + { + "epoch": 2.3391170656709863, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.254338026046753, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7002167701721191, + "num_tokens": 551188296.0, + "step": 21300 + }, + { + "epoch": 2.3392268833735996, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6856937408447266, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7344518899917603, + "num_tokens": 551209808.0, + "step": 21301 + }, + { + "epoch": 2.3393367010762134, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.320430278778076, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7148278951644897, + "num_tokens": 551237982.0, + "step": 21302 + }, + { + "epoch": 2.339446518778827, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.453622341156006, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.720470666885376, + "num_tokens": 551263064.0, + "step": 21303 + }, + { + "epoch": 2.339556336481441, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3517065048217773, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7240795493125916, + "num_tokens": 551290330.0, + "step": 21304 + }, + { + "epoch": 2.3396661541840547, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1257288455963135, + "learning_rate": 1e-06, + 
"loss": 0.9265, + "mean_token_accuracy": 0.7245913743972778, + "num_tokens": 551324588.0, + "step": 21305 + }, + { + "epoch": 2.339775971886668, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5756304264068604, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7143247127532959, + "num_tokens": 551346696.0, + "step": 21306 + }, + { + "epoch": 2.3398857895892817, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4250030517578125, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6922256946563721, + "num_tokens": 551374637.0, + "step": 21307 + }, + { + "epoch": 2.3399956072918955, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.284578800201416, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.71033775806427, + "num_tokens": 551402293.0, + "step": 21308 + }, + { + "epoch": 2.3401054249945092, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3827295303344727, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.702473521232605, + "num_tokens": 551430867.0, + "step": 21309 + }, + { + "epoch": 2.3402152426971226, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5509109497070312, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6961155533790588, + "num_tokens": 551456102.0, + "step": 21310 + }, + { + "epoch": 2.3403250603997363, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6707661151885986, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7116639614105225, + "num_tokens": 551479576.0, + "step": 21311 + }, + { + "epoch": 2.34043487810235, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2769486904144287, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.749506950378418, + "num_tokens": 551508248.0, + "step": 21312 + }, + { + "epoch": 2.340544695804964, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7680277824401855, + "learning_rate": 1e-06, + "loss": 0.9545, + 
"mean_token_accuracy": 0.7111817598342896, + "num_tokens": 551529376.0, + "step": 21313 + }, + { + "epoch": 2.3406545135075776, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3915066719055176, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7223924398422241, + "num_tokens": 551554610.0, + "step": 21314 + }, + { + "epoch": 2.340764331210191, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6325411796569824, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7165473699569702, + "num_tokens": 551579653.0, + "step": 21315 + }, + { + "epoch": 2.3408741489128047, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4430954456329346, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7237163186073303, + "num_tokens": 551603025.0, + "step": 21316 + }, + { + "epoch": 2.3409839666154184, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3658885955810547, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7126952409744263, + "num_tokens": 551631007.0, + "step": 21317 + }, + { + "epoch": 2.341093784318032, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.281808614730835, + "learning_rate": 1e-06, + "loss": 1.0842, + "mean_token_accuracy": 0.6807739734649658, + "num_tokens": 551662782.0, + "step": 21318 + }, + { + "epoch": 2.341203602020646, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7862548828125, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7140929698944092, + "num_tokens": 551683837.0, + "step": 21319 + }, + { + "epoch": 2.3413134197232592, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.631126642227173, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7403064966201782, + "num_tokens": 551705496.0, + "step": 21320 + }, + { + "epoch": 2.341423237425873, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7053821086883545, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 
0.7077056169509888, + "num_tokens": 551728394.0, + "step": 21321 + }, + { + "epoch": 2.3415330551284868, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.622744083404541, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7292730212211609, + "num_tokens": 551748347.0, + "step": 21322 + }, + { + "epoch": 2.3416428728311005, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5560004711151123, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7270825505256653, + "num_tokens": 551771301.0, + "step": 21323 + }, + { + "epoch": 2.3417526905337143, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5865769386291504, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7269909381866455, + "num_tokens": 551791860.0, + "step": 21324 + }, + { + "epoch": 2.3418625082363276, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.266333818435669, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7161334753036499, + "num_tokens": 551820556.0, + "step": 21325 + }, + { + "epoch": 2.3419723259389413, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4854652881622314, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.725105881690979, + "num_tokens": 551845083.0, + "step": 21326 + }, + { + "epoch": 2.342082143641555, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6264944076538086, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7424129247665405, + "num_tokens": 551866820.0, + "step": 21327 + }, + { + "epoch": 2.342191961344169, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.49505352973938, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7223425507545471, + "num_tokens": 551890731.0, + "step": 21328 + }, + { + "epoch": 2.342301779046782, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2995681762695312, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7078924775123596, + 
"num_tokens": 551919352.0, + "step": 21329 + }, + { + "epoch": 2.342411596749396, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5836827754974365, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7058131694793701, + "num_tokens": 551946592.0, + "step": 21330 + }, + { + "epoch": 2.3425214144520097, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.446065664291382, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.7036974430084229, + "num_tokens": 551972924.0, + "step": 21331 + }, + { + "epoch": 2.3426312321546234, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.608703374862671, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7394616007804871, + "num_tokens": 551994666.0, + "step": 21332 + }, + { + "epoch": 2.3427410498572367, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.476525068283081, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6992192268371582, + "num_tokens": 552021422.0, + "step": 21333 + }, + { + "epoch": 2.3428508675598505, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.271271228790283, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7088176012039185, + "num_tokens": 552051876.0, + "step": 21334 + }, + { + "epoch": 2.3429606852624643, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7308268547058105, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7265632152557373, + "num_tokens": 552074281.0, + "step": 21335 + }, + { + "epoch": 2.343070502965078, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.439760684967041, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7181627750396729, + "num_tokens": 552101939.0, + "step": 21336 + }, + { + "epoch": 2.3431803206676918, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.255786180496216, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7208567261695862, + "num_tokens": 552133240.0, + 
"step": 21337 + }, + { + "epoch": 2.343290138370305, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3065781593322754, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7300107479095459, + "num_tokens": 552160652.0, + "step": 21338 + }, + { + "epoch": 2.343399956072919, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.352015256881714, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.7035106420516968, + "num_tokens": 552188326.0, + "step": 21339 + }, + { + "epoch": 2.3435097737755326, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.524085283279419, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7338598370552063, + "num_tokens": 552211141.0, + "step": 21340 + }, + { + "epoch": 2.3436195914781464, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.67989182472229, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7140421867370605, + "num_tokens": 552235120.0, + "step": 21341 + }, + { + "epoch": 2.34372940918076, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.592194080352783, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.728604793548584, + "num_tokens": 552257858.0, + "step": 21342 + }, + { + "epoch": 2.3438392268833734, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3424174785614014, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7237759232521057, + "num_tokens": 552286546.0, + "step": 21343 + }, + { + "epoch": 2.343949044585987, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5023081302642822, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7403326034545898, + "num_tokens": 552308667.0, + "step": 21344 + }, + { + "epoch": 2.344058862288601, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.311734199523926, + "learning_rate": 1e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6768665313720703, + "num_tokens": 552342909.0, + "step": 21345 + }, + { + "epoch": 
2.3441686799912147, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.459305763244629, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7352715730667114, + "num_tokens": 552368696.0, + "step": 21346 + }, + { + "epoch": 2.3442784976938285, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3813130855560303, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6924099922180176, + "num_tokens": 552395639.0, + "step": 21347 + }, + { + "epoch": 2.3443883153964418, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5540637969970703, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.7022531032562256, + "num_tokens": 552420544.0, + "step": 21348 + }, + { + "epoch": 2.3444981330990555, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.90962815284729, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7287039756774902, + "num_tokens": 552439038.0, + "step": 21349 + }, + { + "epoch": 2.3446079508016693, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4356870651245117, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7204993367195129, + "num_tokens": 552464579.0, + "step": 21350 + }, + { + "epoch": 2.344717768504283, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2592055797576904, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7092745900154114, + "num_tokens": 552495383.0, + "step": 21351 + }, + { + "epoch": 2.344827586206897, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.381808042526245, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7081566452980042, + "num_tokens": 552520115.0, + "step": 21352 + }, + { + "epoch": 2.34493740390951, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.626525402069092, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7210731506347656, + "num_tokens": 552541490.0, + "step": 21353 + }, + { + "epoch": 2.345047221612124, + "ewc_loss": 
2.1576881408691406e-05, + "grad_norm": 2.35581636428833, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.708294689655304, + "num_tokens": 552571728.0, + "step": 21354 + }, + { + "epoch": 2.3451570393147376, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.350728750228882, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7198559045791626, + "num_tokens": 552599496.0, + "step": 21355 + }, + { + "epoch": 2.3452668570173514, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.486239433288574, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7025629878044128, + "num_tokens": 552627444.0, + "step": 21356 + }, + { + "epoch": 2.3453766747199647, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.578080177307129, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7147886753082275, + "num_tokens": 552651759.0, + "step": 21357 + }, + { + "epoch": 2.3454864924225785, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1636722087860107, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7178561687469482, + "num_tokens": 552684943.0, + "step": 21358 + }, + { + "epoch": 2.345596310125192, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.583437204360962, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7115020751953125, + "num_tokens": 552708085.0, + "step": 21359 + }, + { + "epoch": 2.345706127827806, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.317098617553711, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6994642019271851, + "num_tokens": 552738251.0, + "step": 21360 + }, + { + "epoch": 2.3458159455304193, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.24997615814209, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7231159210205078, + "num_tokens": 552770263.0, + "step": 21361 + }, + { + "epoch": 2.345925763233033, + "ewc_loss": 2.1576881408691406e-05, + 
"grad_norm": 2.2893450260162354, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6973431706428528, + "num_tokens": 552799588.0, + "step": 21362 + }, + { + "epoch": 2.346035580935647, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4047136306762695, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7106893062591553, + "num_tokens": 552825885.0, + "step": 21363 + }, + { + "epoch": 2.3461453986382605, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.635676145553589, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7054011225700378, + "num_tokens": 552849550.0, + "step": 21364 + }, + { + "epoch": 2.3462552163408743, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.840881109237671, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7291470170021057, + "num_tokens": 552870271.0, + "step": 21365 + }, + { + "epoch": 2.3463650340434876, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.221428155899048, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7274853587150574, + "num_tokens": 552897487.0, + "step": 21366 + }, + { + "epoch": 2.3464748517461014, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2479569911956787, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6979954242706299, + "num_tokens": 552931433.0, + "step": 21367 + }, + { + "epoch": 2.346584669448715, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7679998874664307, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7202185988426208, + "num_tokens": 552952389.0, + "step": 21368 + }, + { + "epoch": 2.346694487151329, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4465668201446533, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7165327072143555, + "num_tokens": 552980525.0, + "step": 21369 + }, + { + "epoch": 2.3468043048539426, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 
2.327563762664795, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7125028371810913, + "num_tokens": 553006900.0, + "step": 21370 + }, + { + "epoch": 2.346914122556556, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.487114906311035, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7074766159057617, + "num_tokens": 553032119.0, + "step": 21371 + }, + { + "epoch": 2.3470239402591697, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7873241901397705, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7372530698776245, + "num_tokens": 553051757.0, + "step": 21372 + }, + { + "epoch": 2.3471337579617835, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 3.170933961868286, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7360679507255554, + "num_tokens": 553068911.0, + "step": 21373 + }, + { + "epoch": 2.3472435756643972, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6005687713623047, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7264310121536255, + "num_tokens": 553091522.0, + "step": 21374 + }, + { + "epoch": 2.347353393367011, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.630678415298462, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7068202495574951, + "num_tokens": 553114261.0, + "step": 21375 + }, + { + "epoch": 2.3474632110696243, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4669113159179688, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.705863356590271, + "num_tokens": 553138696.0, + "step": 21376 + }, + { + "epoch": 2.347573028772238, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.460820198059082, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6974620223045349, + "num_tokens": 553164347.0, + "step": 21377 + }, + { + "epoch": 2.347682846474852, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2227885723114014, + 
"learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6914132833480835, + "num_tokens": 553196295.0, + "step": 21378 + }, + { + "epoch": 2.3477926641774656, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2455217838287354, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7054293155670166, + "num_tokens": 553224688.0, + "step": 21379 + }, + { + "epoch": 2.347902481880079, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.478733539581299, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7175044417381287, + "num_tokens": 553248386.0, + "step": 21380 + }, + { + "epoch": 2.3480122995826926, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4603962898254395, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7030292749404907, + "num_tokens": 553275638.0, + "step": 21381 + }, + { + "epoch": 2.3481221172853064, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8025808334350586, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7404910922050476, + "num_tokens": 553294839.0, + "step": 21382 + }, + { + "epoch": 2.34823193498792, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.670506238937378, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7364023923873901, + "num_tokens": 553316714.0, + "step": 21383 + }, + { + "epoch": 2.3483417526905335, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2920777797698975, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7037920355796814, + "num_tokens": 553346341.0, + "step": 21384 + }, + { + "epoch": 2.3484515703931472, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.608774185180664, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7074154615402222, + "num_tokens": 553368820.0, + "step": 21385 + }, + { + "epoch": 2.348561388095761, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.64343523979187, + "learning_rate": 1e-06, + 
"loss": 0.9103, + "mean_token_accuracy": 0.7351164221763611, + "num_tokens": 553390215.0, + "step": 21386 + }, + { + "epoch": 2.3486712057983747, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.44800066947937, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7083443403244019, + "num_tokens": 553416577.0, + "step": 21387 + }, + { + "epoch": 2.3487810235009885, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3670706748962402, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7302217483520508, + "num_tokens": 553443738.0, + "step": 21388 + }, + { + "epoch": 2.348890841203602, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.732933282852173, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7065507769584656, + "num_tokens": 553465023.0, + "step": 21389 + }, + { + "epoch": 2.3490006589062156, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5426201820373535, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6935778856277466, + "num_tokens": 553489967.0, + "step": 21390 + }, + { + "epoch": 2.3491104766088293, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3662049770355225, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.726496696472168, + "num_tokens": 553516728.0, + "step": 21391 + }, + { + "epoch": 2.349220294311443, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.590820074081421, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.725145697593689, + "num_tokens": 553538784.0, + "step": 21392 + }, + { + "epoch": 2.349330112014057, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.2898800373077393, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6795934438705444, + "num_tokens": 553568410.0, + "step": 21393 + }, + { + "epoch": 2.34943992971667, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.29685378074646, + "learning_rate": 1e-06, + "loss": 0.9081, + 
"mean_token_accuracy": 0.7336448431015015, + "num_tokens": 553597423.0, + "step": 21394 + }, + { + "epoch": 2.349549747419284, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7542617321014404, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7187492847442627, + "num_tokens": 553619223.0, + "step": 21395 + }, + { + "epoch": 2.3496595651218977, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.1850578784942627, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6996586322784424, + "num_tokens": 553648604.0, + "step": 21396 + }, + { + "epoch": 2.3497693828245114, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.749239444732666, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7262939810752869, + "num_tokens": 553668209.0, + "step": 21397 + }, + { + "epoch": 2.349879200527125, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3977088928222656, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7165030241012573, + "num_tokens": 553693394.0, + "step": 21398 + }, + { + "epoch": 2.3499890182297385, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.69053316116333, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.716168999671936, + "num_tokens": 553714622.0, + "step": 21399 + }, + { + "epoch": 2.3500988359323522, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4501943588256836, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7381453514099121, + "num_tokens": 553738769.0, + "step": 21400 + }, + { + "epoch": 2.350208653634966, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6564292907714844, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6882296800613403, + "num_tokens": 553762917.0, + "step": 21401 + }, + { + "epoch": 2.3503184713375798, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.067906618118286, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 
0.709089994430542, + "num_tokens": 553798724.0, + "step": 21402 + }, + { + "epoch": 2.3504282890401935, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8684768676757812, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.719068169593811, + "num_tokens": 553819757.0, + "step": 21403 + }, + { + "epoch": 2.350538106742807, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3906569480895996, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7196097373962402, + "num_tokens": 553846996.0, + "step": 21404 + }, + { + "epoch": 2.3506479244454206, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4909093379974365, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7050121426582336, + "num_tokens": 553871149.0, + "step": 21405 + }, + { + "epoch": 2.3507577421480343, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2241852283477783, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7012118101119995, + "num_tokens": 553904582.0, + "step": 21406 + }, + { + "epoch": 2.350867559850648, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6834144592285156, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7299020290374756, + "num_tokens": 553925932.0, + "step": 21407 + }, + { + "epoch": 2.3509773775532614, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5036978721618652, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7325371503829956, + "num_tokens": 553950760.0, + "step": 21408 + }, + { + "epoch": 2.351087195255875, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.299360513687134, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6978273987770081, + "num_tokens": 553980795.0, + "step": 21409 + }, + { + "epoch": 2.351197012958489, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.365471601486206, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7386053204536438, + 
"num_tokens": 554007473.0, + "step": 21410 + }, + { + "epoch": 2.3513068306611027, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3064229488372803, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6997705698013306, + "num_tokens": 554036706.0, + "step": 21411 + }, + { + "epoch": 2.351416648363716, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8858823776245117, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7502949237823486, + "num_tokens": 554055256.0, + "step": 21412 + }, + { + "epoch": 2.3515264660663298, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.527808427810669, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.717329740524292, + "num_tokens": 554081341.0, + "step": 21413 + }, + { + "epoch": 2.3516362837689435, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2772903442382812, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7133352756500244, + "num_tokens": 554110544.0, + "step": 21414 + }, + { + "epoch": 2.3517461014715573, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3963184356689453, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7131067514419556, + "num_tokens": 554136803.0, + "step": 21415 + }, + { + "epoch": 2.351855919174171, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.351963520050049, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7107212543487549, + "num_tokens": 554164323.0, + "step": 21416 + }, + { + "epoch": 2.3519657368767843, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3132400512695312, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.732032060623169, + "num_tokens": 554188916.0, + "step": 21417 + }, + { + "epoch": 2.352075554579398, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.636988401412964, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7231272459030151, + "num_tokens": 554210815.0, + 
"step": 21418 + }, + { + "epoch": 2.352185372282012, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6217501163482666, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7193472385406494, + "num_tokens": 554233278.0, + "step": 21419 + }, + { + "epoch": 2.3522951899846256, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5851874351501465, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7096790671348572, + "num_tokens": 554256319.0, + "step": 21420 + }, + { + "epoch": 2.3524050076872394, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8566973209381104, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7065479755401611, + "num_tokens": 554275103.0, + "step": 21421 + }, + { + "epoch": 2.3525148253898527, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3465256690979004, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7059140205383301, + "num_tokens": 554302972.0, + "step": 21422 + }, + { + "epoch": 2.3526246430924664, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6613125801086426, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7122363448143005, + "num_tokens": 554326597.0, + "step": 21423 + }, + { + "epoch": 2.35273446079508, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5163843631744385, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6989849805831909, + "num_tokens": 554350438.0, + "step": 21424 + }, + { + "epoch": 2.352844278497694, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7187881469726562, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7154664993286133, + "num_tokens": 554372166.0, + "step": 21425 + }, + { + "epoch": 2.3529540962003077, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5261709690093994, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7261910438537598, + "num_tokens": 554394105.0, + "step": 21426 + }, + { + 
"epoch": 2.353063913902921, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.552046060562134, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7056503295898438, + "num_tokens": 554419285.0, + "step": 21427 + }, + { + "epoch": 2.353173731605535, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.588005304336548, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7225587964057922, + "num_tokens": 554442757.0, + "step": 21428 + }, + { + "epoch": 2.3532835493081485, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4553446769714355, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6977204084396362, + "num_tokens": 554468867.0, + "step": 21429 + }, + { + "epoch": 2.3533933670107623, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3485727310180664, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7118747234344482, + "num_tokens": 554499879.0, + "step": 21430 + }, + { + "epoch": 2.3535031847133756, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.575359582901001, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7187953591346741, + "num_tokens": 554524143.0, + "step": 21431 + }, + { + "epoch": 2.3536130024159894, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6358914375305176, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7207430601119995, + "num_tokens": 554546452.0, + "step": 21432 + }, + { + "epoch": 2.353722820118603, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.45198392868042, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7258436679840088, + "num_tokens": 554571337.0, + "step": 21433 + }, + { + "epoch": 2.353832637821217, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.7174696922302246, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7158844470977783, + "num_tokens": 554595132.0, + "step": 21434 + }, + { + "epoch": 2.3539424555238306, + 
"ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3993096351623535, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.707396388053894, + "num_tokens": 554622249.0, + "step": 21435 + }, + { + "epoch": 2.354052273226444, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.171685218811035, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.7030502557754517, + "num_tokens": 554654011.0, + "step": 21436 + }, + { + "epoch": 2.3541620909290577, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3462889194488525, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7159126400947571, + "num_tokens": 554681248.0, + "step": 21437 + }, + { + "epoch": 2.3542719086316715, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4302890300750732, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7308753132820129, + "num_tokens": 554704839.0, + "step": 21438 + }, + { + "epoch": 2.354381726334285, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1238059997558594, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7117269039154053, + "num_tokens": 554736566.0, + "step": 21439 + }, + { + "epoch": 2.3544915440368985, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6020188331604004, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7158271074295044, + "num_tokens": 554759453.0, + "step": 21440 + }, + { + "epoch": 2.3546013617395123, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4176247119903564, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.7149979472160339, + "num_tokens": 554785923.0, + "step": 21441 + }, + { + "epoch": 2.354711179442126, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.743126392364502, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7063409090042114, + "num_tokens": 554807141.0, + "step": 21442 + }, + { + "epoch": 2.35482099714474, + "ewc_loss": 
2.1576881408691406e-05, + "grad_norm": 2.4398186206817627, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7530529499053955, + "num_tokens": 554830120.0, + "step": 21443 + }, + { + "epoch": 2.3549308148473536, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.5275800228118896, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7158428430557251, + "num_tokens": 554854882.0, + "step": 21444 + }, + { + "epoch": 2.355040632549967, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.565809965133667, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6885521411895752, + "num_tokens": 554879697.0, + "step": 21445 + }, + { + "epoch": 2.3551504502525806, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6293373107910156, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7362182140350342, + "num_tokens": 554901742.0, + "step": 21446 + }, + { + "epoch": 2.3552602679551944, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4810588359832764, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7100839018821716, + "num_tokens": 554927477.0, + "step": 21447 + }, + { + "epoch": 2.355370085657808, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.9015705585479736, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7484020590782166, + "num_tokens": 554945121.0, + "step": 21448 + }, + { + "epoch": 2.355479903360422, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4981446266174316, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7079539895057678, + "num_tokens": 554971482.0, + "step": 21449 + }, + { + "epoch": 2.355589721063035, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6748790740966797, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7105312347412109, + "num_tokens": 554991702.0, + "step": 21450 + }, + { + "epoch": 2.355699538765649, + "ewc_loss": 2.1696090698242188e-05, + 
"grad_norm": 2.605518102645874, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7363204956054688, + "num_tokens": 555012775.0, + "step": 21451 + }, + { + "epoch": 2.3558093564682627, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3487796783447266, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7084022760391235, + "num_tokens": 555042151.0, + "step": 21452 + }, + { + "epoch": 2.3559191741708765, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3466122150421143, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7166222333908081, + "num_tokens": 555069862.0, + "step": 21453 + }, + { + "epoch": 2.3560289918734902, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.721222400665283, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7307118773460388, + "num_tokens": 555092922.0, + "step": 21454 + }, + { + "epoch": 2.3561388095761036, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 3.009356737136841, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7234095335006714, + "num_tokens": 555119169.0, + "step": 21455 + }, + { + "epoch": 2.3562486272787173, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.52763295173645, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7134061455726624, + "num_tokens": 555145956.0, + "step": 21456 + }, + { + "epoch": 2.356358444981331, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.533966064453125, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7018741965293884, + "num_tokens": 555170733.0, + "step": 21457 + }, + { + "epoch": 2.356468262683945, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7169113159179688, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.740662693977356, + "num_tokens": 555190957.0, + "step": 21458 + }, + { + "epoch": 2.356578080386558, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.803497314453125, + 
"learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7176719307899475, + "num_tokens": 555211908.0, + "step": 21459 + }, + { + "epoch": 2.356687898089172, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4919800758361816, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7103662490844727, + "num_tokens": 555236327.0, + "step": 21460 + }, + { + "epoch": 2.3567977157917857, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.6567976474761963, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7210926413536072, + "num_tokens": 555260806.0, + "step": 21461 + }, + { + "epoch": 2.3569075334943994, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.4753870964050293, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7043490409851074, + "num_tokens": 555290160.0, + "step": 21462 + }, + { + "epoch": 2.3570173511970127, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.8640379905700684, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7205709218978882, + "num_tokens": 555310957.0, + "step": 21463 + }, + { + "epoch": 2.3571271688996265, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.528999090194702, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7101185917854309, + "num_tokens": 555335161.0, + "step": 21464 + }, + { + "epoch": 2.3572369866022402, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.3154022693634033, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7076959013938904, + "num_tokens": 555362095.0, + "step": 21465 + }, + { + "epoch": 2.357346804304854, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.586031913757324, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7295882701873779, + "num_tokens": 555384986.0, + "step": 21466 + }, + { + "epoch": 2.3574566220074678, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.574167251586914, + "learning_rate": 1e-06, + 
"loss": 0.9515, + "mean_token_accuracy": 0.7190746665000916, + "num_tokens": 555410007.0, + "step": 21467 + }, + { + "epoch": 2.357566439710081, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3036563396453857, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7073268890380859, + "num_tokens": 555441338.0, + "step": 21468 + }, + { + "epoch": 2.357676257412695, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3294482231140137, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6971397995948792, + "num_tokens": 555470582.0, + "step": 21469 + }, + { + "epoch": 2.3577860751153086, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2314562797546387, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.71611487865448, + "num_tokens": 555503886.0, + "step": 21470 + }, + { + "epoch": 2.3578958928179223, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 3.8479700088500977, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7175121903419495, + "num_tokens": 555528617.0, + "step": 21471 + }, + { + "epoch": 2.358005710520536, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.375880241394043, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7239944934844971, + "num_tokens": 555555610.0, + "step": 21472 + }, + { + "epoch": 2.3581155282231494, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.409893274307251, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.688197672367096, + "num_tokens": 555581989.0, + "step": 21473 + }, + { + "epoch": 2.358225345925763, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.772385597229004, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7185907959938049, + "num_tokens": 555604084.0, + "step": 21474 + }, + { + "epoch": 2.358335163628377, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4033942222595215, + "learning_rate": 1e-06, + "loss": 0.895, + 
"mean_token_accuracy": 0.7282613515853882, + "num_tokens": 555631310.0, + "step": 21475 + }, + { + "epoch": 2.3584449813309907, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.409346342086792, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7053198218345642, + "num_tokens": 555661471.0, + "step": 21476 + }, + { + "epoch": 2.3585547990336044, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.479961633682251, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7351655960083008, + "num_tokens": 555686611.0, + "step": 21477 + }, + { + "epoch": 2.3586646167362177, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.657754421234131, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7293709516525269, + "num_tokens": 555706852.0, + "step": 21478 + }, + { + "epoch": 2.3587744344388315, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 2.424511671066284, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7387272715568542, + "num_tokens": 555731573.0, + "step": 21479 + }, + { + "epoch": 2.3588842521414453, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6481423377990723, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7387006282806396, + "num_tokens": 555754399.0, + "step": 21480 + }, + { + "epoch": 2.358994069844059, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.589092254638672, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.7012709975242615, + "num_tokens": 555779355.0, + "step": 21481 + }, + { + "epoch": 2.3591038875466728, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6295387744903564, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7242929935455322, + "num_tokens": 555801736.0, + "step": 21482 + }, + { + "epoch": 2.359213705249286, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5327699184417725, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 
0.7133842706680298, + "num_tokens": 555828488.0, + "step": 21483 + }, + { + "epoch": 2.3593235229519, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5169785022735596, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7384876012802124, + "num_tokens": 555853675.0, + "step": 21484 + }, + { + "epoch": 2.3594333406545136, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.530796527862549, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7277268767356873, + "num_tokens": 555877499.0, + "step": 21485 + }, + { + "epoch": 2.3595431583571274, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3232028484344482, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7100591063499451, + "num_tokens": 555904816.0, + "step": 21486 + }, + { + "epoch": 2.3596529760597407, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.473261833190918, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7249436974525452, + "num_tokens": 555930791.0, + "step": 21487 + }, + { + "epoch": 2.3597627937623544, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.557101249694824, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.708014726638794, + "num_tokens": 555954763.0, + "step": 21488 + }, + { + "epoch": 2.359872611464968, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4778847694396973, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7446151971817017, + "num_tokens": 555978907.0, + "step": 21489 + }, + { + "epoch": 2.359982429167582, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.468703269958496, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7123674154281616, + "num_tokens": 556004729.0, + "step": 21490 + }, + { + "epoch": 2.3600922468701953, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3795456886291504, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7050083875656128, + 
"num_tokens": 556031476.0, + "step": 21491 + }, + { + "epoch": 2.360202064572809, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5253124237060547, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7156998515129089, + "num_tokens": 556054552.0, + "step": 21492 + }, + { + "epoch": 2.3603118822754228, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.648191213607788, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6983957290649414, + "num_tokens": 556079224.0, + "step": 21493 + }, + { + "epoch": 2.3604216999780365, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7424418926239014, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7261285781860352, + "num_tokens": 556099862.0, + "step": 21494 + }, + { + "epoch": 2.3605315176806503, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.253941059112549, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.7000598311424255, + "num_tokens": 556131139.0, + "step": 21495 + }, + { + "epoch": 2.3606413353832636, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5028843879699707, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7268674969673157, + "num_tokens": 556154231.0, + "step": 21496 + }, + { + "epoch": 2.3607511530858774, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2161333560943604, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.7085751295089722, + "num_tokens": 556185699.0, + "step": 21497 + }, + { + "epoch": 2.360860970788491, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.206695318222046, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7266523838043213, + "num_tokens": 556213295.0, + "step": 21498 + }, + { + "epoch": 2.360970788491105, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.59835147857666, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7010703682899475, + "num_tokens": 556236909.0, + 
"step": 21499 + }, + { + "epoch": 2.3610806061937186, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6689469814300537, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7180070877075195, + "num_tokens": 556260911.0, + "step": 21500 + }, + { + "epoch": 2.361190423896332, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6616055965423584, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7303322553634644, + "num_tokens": 556284155.0, + "step": 21501 + }, + { + "epoch": 2.3613002415989457, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4348196983337402, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.707165002822876, + "num_tokens": 556311007.0, + "step": 21502 + }, + { + "epoch": 2.3614100593015594, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.95624041557312, + "learning_rate": 1e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7553277015686035, + "num_tokens": 556327874.0, + "step": 21503 + }, + { + "epoch": 2.361519877004173, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.408708333969116, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7203578352928162, + "num_tokens": 556354663.0, + "step": 21504 + }, + { + "epoch": 2.361629694706787, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4114480018615723, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7453983426094055, + "num_tokens": 556380131.0, + "step": 21505 + }, + { + "epoch": 2.3617395124094003, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.222776174545288, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7166057825088501, + "num_tokens": 556412535.0, + "step": 21506 + }, + { + "epoch": 2.361849330112014, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.398226737976074, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.710686206817627, + "num_tokens": 556438374.0, + "step": 21507 + }, + { + "epoch": 
2.361959147814628, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6054892539978027, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7127711772918701, + "num_tokens": 556461616.0, + "step": 21508 + }, + { + "epoch": 2.3620689655172415, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5459508895874023, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7236799001693726, + "num_tokens": 556483612.0, + "step": 21509 + }, + { + "epoch": 2.362178783219855, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.460651397705078, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7249847650527954, + "num_tokens": 556508354.0, + "step": 21510 + }, + { + "epoch": 2.3622886009224686, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.471341133117676, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7217473983764648, + "num_tokens": 556533953.0, + "step": 21511 + }, + { + "epoch": 2.3623984186250824, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4501800537109375, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7140580415725708, + "num_tokens": 556559964.0, + "step": 21512 + }, + { + "epoch": 2.362508236327696, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.33146595954895, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7041029930114746, + "num_tokens": 556591345.0, + "step": 21513 + }, + { + "epoch": 2.3626180540303094, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.8478145599365234, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.719550609588623, + "num_tokens": 556610964.0, + "step": 21514 + }, + { + "epoch": 2.362727871732923, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.459564685821533, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7316298484802246, + "num_tokens": 556635293.0, + "step": 21515 + }, + { + "epoch": 2.362837689435537, + "ewc_loss": 
2.1696090698242188e-05, + "grad_norm": 2.313275098800659, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6950002908706665, + "num_tokens": 556663735.0, + "step": 21516 + }, + { + "epoch": 2.3629475071381507, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7292263507843018, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.715601921081543, + "num_tokens": 556686187.0, + "step": 21517 + }, + { + "epoch": 2.3630573248407645, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3223907947540283, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.7071025967597961, + "num_tokens": 556714669.0, + "step": 21518 + }, + { + "epoch": 2.363167142543378, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3367843627929688, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7128812074661255, + "num_tokens": 556741696.0, + "step": 21519 + }, + { + "epoch": 2.3632769602459915, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.505801200866699, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7230300903320312, + "num_tokens": 556763765.0, + "step": 21520 + }, + { + "epoch": 2.3633867779486053, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3713653087615967, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7088325023651123, + "num_tokens": 556790514.0, + "step": 21521 + }, + { + "epoch": 2.363496595651219, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6426620483398438, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7130175828933716, + "num_tokens": 556813619.0, + "step": 21522 + }, + { + "epoch": 2.363606413353833, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3372631072998047, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7263053059577942, + "num_tokens": 556841250.0, + "step": 21523 + }, + { + "epoch": 2.363716231056446, + "ewc_loss": 2.1696090698242188e-05, + 
"grad_norm": 2.600162982940674, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7314856648445129, + "num_tokens": 556863528.0, + "step": 21524 + }, + { + "epoch": 2.36382604875906, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2894833087921143, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7098800539970398, + "num_tokens": 556892962.0, + "step": 21525 + }, + { + "epoch": 2.3639358664616736, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.365225315093994, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7077051401138306, + "num_tokens": 556920673.0, + "step": 21526 + }, + { + "epoch": 2.3640456841642874, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.558518886566162, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7433204650878906, + "num_tokens": 556943449.0, + "step": 21527 + }, + { + "epoch": 2.364155501866901, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.227727174758911, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7012012004852295, + "num_tokens": 556972204.0, + "step": 21528 + }, + { + "epoch": 2.3642653195695145, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3272104263305664, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.690352201461792, + "num_tokens": 557002786.0, + "step": 21529 + }, + { + "epoch": 2.3643751372721282, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.59236216545105, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6913367509841919, + "num_tokens": 557028432.0, + "step": 21530 + }, + { + "epoch": 2.364484954974742, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5424764156341553, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6963269710540771, + "num_tokens": 557053635.0, + "step": 21531 + }, + { + "epoch": 2.3645947726773557, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4659218788146973, + 
"learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7209115624427795, + "num_tokens": 557080238.0, + "step": 21532 + }, + { + "epoch": 2.3647045903799695, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.557932138442993, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7223067283630371, + "num_tokens": 557102780.0, + "step": 21533 + }, + { + "epoch": 2.364814408082583, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4589035511016846, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7035275101661682, + "num_tokens": 557131178.0, + "step": 21534 + }, + { + "epoch": 2.3649242257851966, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6344876289367676, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7289024591445923, + "num_tokens": 557153394.0, + "step": 21535 + }, + { + "epoch": 2.3650340434878103, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2860498428344727, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7150020599365234, + "num_tokens": 557180490.0, + "step": 21536 + }, + { + "epoch": 2.365143861190424, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1272621154785156, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7130570411682129, + "num_tokens": 557213714.0, + "step": 21537 + }, + { + "epoch": 2.3652536788930374, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4869384765625, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7240189909934998, + "num_tokens": 557238581.0, + "step": 21538 + }, + { + "epoch": 2.365363496595651, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5158307552337646, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7034411430358887, + "num_tokens": 557264846.0, + "step": 21539 + }, + { + "epoch": 2.365473314298265, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.446134328842163, + "learning_rate": 1e-06, + "loss": 
0.9672, + "mean_token_accuracy": 0.7080968618392944, + "num_tokens": 557291147.0, + "step": 21540 + }, + { + "epoch": 2.3655831320008787, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.678845167160034, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7159291505813599, + "num_tokens": 557312834.0, + "step": 21541 + }, + { + "epoch": 2.365692949703492, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.385249376296997, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7284857034683228, + "num_tokens": 557339964.0, + "step": 21542 + }, + { + "epoch": 2.3658027674061057, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.423340082168579, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.729037344455719, + "num_tokens": 557366144.0, + "step": 21543 + }, + { + "epoch": 2.3659125851087195, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4960360527038574, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.706201434135437, + "num_tokens": 557391270.0, + "step": 21544 + }, + { + "epoch": 2.3660224028113332, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.395314931869507, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7136858105659485, + "num_tokens": 557417639.0, + "step": 21545 + }, + { + "epoch": 2.366132220513947, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4489409923553467, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7185392379760742, + "num_tokens": 557444990.0, + "step": 21546 + }, + { + "epoch": 2.3662420382165603, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6378767490386963, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6909018158912659, + "num_tokens": 557467022.0, + "step": 21547 + }, + { + "epoch": 2.366351855919174, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.323927164077759, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 
0.7049007415771484, + "num_tokens": 557495147.0, + "step": 21548 + }, + { + "epoch": 2.366461673621788, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.509406089782715, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7240463495254517, + "num_tokens": 557519017.0, + "step": 21549 + }, + { + "epoch": 2.3665714913244016, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5265889167785645, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7510567903518677, + "num_tokens": 557543080.0, + "step": 21550 + }, + { + "epoch": 2.3666813090270153, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.471879005432129, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7142963409423828, + "num_tokens": 557569638.0, + "step": 21551 + }, + { + "epoch": 2.3667911267296287, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3015973567962646, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.725990355014801, + "num_tokens": 557598536.0, + "step": 21552 + }, + { + "epoch": 2.3669009444322424, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.093859910964966, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7033557295799255, + "num_tokens": 557634436.0, + "step": 21553 + }, + { + "epoch": 2.367010762134856, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.9386637210845947, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7366259098052979, + "num_tokens": 557654703.0, + "step": 21554 + }, + { + "epoch": 2.36712057983747, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4089317321777344, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7102152109146118, + "num_tokens": 557683104.0, + "step": 21555 + }, + { + "epoch": 2.3672303975400837, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.300034284591675, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6987646818161011, + 
"num_tokens": 557715197.0, + "step": 21556 + }, + { + "epoch": 2.367340215242697, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.34272837638855, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.716544508934021, + "num_tokens": 557742148.0, + "step": 21557 + }, + { + "epoch": 2.3674500329453108, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2747139930725098, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7122570276260376, + "num_tokens": 557769927.0, + "step": 21558 + }, + { + "epoch": 2.3675598506479245, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5511410236358643, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.718985915184021, + "num_tokens": 557793071.0, + "step": 21559 + }, + { + "epoch": 2.3676696683505383, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 6.991238117218018, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7311826944351196, + "num_tokens": 557822869.0, + "step": 21560 + }, + { + "epoch": 2.3677794860531516, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2941839694976807, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7170513272285461, + "num_tokens": 557852209.0, + "step": 21561 + }, + { + "epoch": 2.3678893037557653, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.538156032562256, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7031649947166443, + "num_tokens": 557876654.0, + "step": 21562 + }, + { + "epoch": 2.367999121458379, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2638099193573, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.724734902381897, + "num_tokens": 557906940.0, + "step": 21563 + }, + { + "epoch": 2.368108939160993, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.262596845626831, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.7058328986167908, + "num_tokens": 557938472.0, + "step": 
21564 + }, + { + "epoch": 2.368218756863606, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.704069137573242, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7110544443130493, + "num_tokens": 557960421.0, + "step": 21565 + }, + { + "epoch": 2.36832857456622, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.669674873352051, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7273929119110107, + "num_tokens": 557981998.0, + "step": 21566 + }, + { + "epoch": 2.3684383922688337, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 3.9742274284362793, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7253428101539612, + "num_tokens": 558004848.0, + "step": 21567 + }, + { + "epoch": 2.3685482099714474, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.585158586502075, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7258157134056091, + "num_tokens": 558028237.0, + "step": 21568 + }, + { + "epoch": 2.368658027674061, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3278591632843018, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7108203172683716, + "num_tokens": 558058594.0, + "step": 21569 + }, + { + "epoch": 2.3687678453766745, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3986103534698486, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7372512221336365, + "num_tokens": 558084242.0, + "step": 21570 + }, + { + "epoch": 2.3688776630792883, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.275981903076172, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6865280866622925, + "num_tokens": 558115959.0, + "step": 21571 + }, + { + "epoch": 2.368987480781902, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.433075428009033, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7031271457672119, + "num_tokens": 558141050.0, + "step": 21572 + }, + { + "epoch": 
2.369097298484516, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2752363681793213, + "learning_rate": 1e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.6807252168655396, + "num_tokens": 558173423.0, + "step": 21573 + }, + { + "epoch": 2.3692071161871295, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.415735960006714, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7180333137512207, + "num_tokens": 558197401.0, + "step": 21574 + }, + { + "epoch": 2.369316933889743, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6926686763763428, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7202866077423096, + "num_tokens": 558220872.0, + "step": 21575 + }, + { + "epoch": 2.3694267515923566, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.523742914199829, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7162289619445801, + "num_tokens": 558245340.0, + "step": 21576 + }, + { + "epoch": 2.3695365692949704, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4210221767425537, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7368787527084351, + "num_tokens": 558269052.0, + "step": 21577 + }, + { + "epoch": 2.369646386997584, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.379323720932007, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.722084641456604, + "num_tokens": 558296135.0, + "step": 21578 + }, + { + "epoch": 2.369756204700198, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6679375171661377, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7072155475616455, + "num_tokens": 558319289.0, + "step": 21579 + }, + { + "epoch": 2.369866022402811, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5071768760681152, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7272137403488159, + "num_tokens": 558341681.0, + "step": 21580 + }, + { + "epoch": 2.369975840105425, + "ewc_loss": 
2.1696090698242188e-05, + "grad_norm": 2.3320043087005615, + "learning_rate": 1e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6955180764198303, + "num_tokens": 558369106.0, + "step": 21581 + }, + { + "epoch": 2.3700856578080387, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2970805168151855, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7156016826629639, + "num_tokens": 558398945.0, + "step": 21582 + }, + { + "epoch": 2.3701954755106525, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.8551082611083984, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.724141538143158, + "num_tokens": 558422580.0, + "step": 21583 + }, + { + "epoch": 2.370305293213266, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2754030227661133, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.7232968211174011, + "num_tokens": 558450847.0, + "step": 21584 + }, + { + "epoch": 2.3704151109158795, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3637170791625977, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.71734619140625, + "num_tokens": 558475667.0, + "step": 21585 + }, + { + "epoch": 2.3705249286184933, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2798397541046143, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7262155413627625, + "num_tokens": 558504062.0, + "step": 21586 + }, + { + "epoch": 2.370634746321107, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.478658676147461, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7244873046875, + "num_tokens": 558527171.0, + "step": 21587 + }, + { + "epoch": 2.370744564023721, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.8514678478240967, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7248414158821106, + "num_tokens": 558546295.0, + "step": 21588 + }, + { + "epoch": 2.370854381726334, + "ewc_loss": 2.1696090698242188e-05, + 
"grad_norm": 2.6755576133728027, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7057522535324097, + "num_tokens": 558568236.0, + "step": 21589 + }, + { + "epoch": 2.370964199428948, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2847564220428467, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6906982064247131, + "num_tokens": 558598369.0, + "step": 21590 + }, + { + "epoch": 2.3710740171315616, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 4.006366729736328, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7440410256385803, + "num_tokens": 558620381.0, + "step": 21591 + }, + { + "epoch": 2.3711838348341754, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6737732887268066, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7234863638877869, + "num_tokens": 558642173.0, + "step": 21592 + }, + { + "epoch": 2.3712936525367887, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4709787368774414, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7079072594642639, + "num_tokens": 558669174.0, + "step": 21593 + }, + { + "epoch": 2.3714034702394025, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.745450735092163, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.721576452255249, + "num_tokens": 558691184.0, + "step": 21594 + }, + { + "epoch": 2.371513287942016, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 3.0056424140930176, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7406643629074097, + "num_tokens": 558707598.0, + "step": 21595 + }, + { + "epoch": 2.37162310564463, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.241039991378784, + "learning_rate": 1e-06, + "loss": 1.1025, + "mean_token_accuracy": 0.6780251264572144, + "num_tokens": 558744396.0, + "step": 21596 + }, + { + "epoch": 2.3717329233472437, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 
2.5829145908355713, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7266541719436646, + "num_tokens": 558769673.0, + "step": 21597 + }, + { + "epoch": 2.371842741049857, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5632433891296387, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7159962058067322, + "num_tokens": 558795749.0, + "step": 21598 + }, + { + "epoch": 2.371952558752471, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2139713764190674, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7304147481918335, + "num_tokens": 558826193.0, + "step": 21599 + }, + { + "epoch": 2.3720623764550846, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5171828269958496, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7347755432128906, + "num_tokens": 558850118.0, + "step": 21600 + }, + { + "epoch": 2.3721721941576983, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7250919342041016, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7143433094024658, + "num_tokens": 558870899.0, + "step": 21601 + }, + { + "epoch": 2.372282011860312, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4127142429351807, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.7091650366783142, + "num_tokens": 558897874.0, + "step": 21602 + }, + { + "epoch": 2.3723918295629254, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.436676502227783, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7058213949203491, + "num_tokens": 558925023.0, + "step": 21603 + }, + { + "epoch": 2.372501647265539, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5193848609924316, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7198039293289185, + "num_tokens": 558950567.0, + "step": 21604 + }, + { + "epoch": 2.372611464968153, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.487717390060425, + 
"learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7263020873069763, + "num_tokens": 558974871.0, + "step": 21605 + }, + { + "epoch": 2.3727212826707667, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2460408210754395, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7051884531974792, + "num_tokens": 559005912.0, + "step": 21606 + }, + { + "epoch": 2.3728311003733804, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.370255470275879, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7133272886276245, + "num_tokens": 559032318.0, + "step": 21607 + }, + { + "epoch": 2.3729409180759937, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.9181602001190186, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7365419268608093, + "num_tokens": 559050923.0, + "step": 21608 + }, + { + "epoch": 2.3730507357786075, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.141935110092163, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7366347908973694, + "num_tokens": 559081779.0, + "step": 21609 + }, + { + "epoch": 2.3731605534812212, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7371792793273926, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7070459127426147, + "num_tokens": 559101853.0, + "step": 21610 + }, + { + "epoch": 2.373270371183835, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6814019680023193, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6972357630729675, + "num_tokens": 559125924.0, + "step": 21611 + }, + { + "epoch": 2.3733801888864483, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2383527755737305, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7254894375801086, + "num_tokens": 559155653.0, + "step": 21612 + }, + { + "epoch": 2.373490006589062, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6906135082244873, + "learning_rate": 1e-06, + 
"loss": 0.9621, + "mean_token_accuracy": 0.7179771661758423, + "num_tokens": 559179068.0, + "step": 21613 + }, + { + "epoch": 2.373599824291676, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.368755578994751, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7049461603164673, + "num_tokens": 559206875.0, + "step": 21614 + }, + { + "epoch": 2.3737096419942896, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3369252681732178, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.713141918182373, + "num_tokens": 559235853.0, + "step": 21615 + }, + { + "epoch": 2.3738194596969033, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7877285480499268, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7452520132064819, + "num_tokens": 559256191.0, + "step": 21616 + }, + { + "epoch": 2.3739292773995166, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6896352767944336, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.7003408074378967, + "num_tokens": 559278183.0, + "step": 21617 + }, + { + "epoch": 2.3740390951021304, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6652960777282715, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7239664196968079, + "num_tokens": 559299964.0, + "step": 21618 + }, + { + "epoch": 2.374148912804744, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5020406246185303, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7028605937957764, + "num_tokens": 559324445.0, + "step": 21619 + }, + { + "epoch": 2.374258730507358, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.492050886154175, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7120969295501709, + "num_tokens": 559349323.0, + "step": 21620 + }, + { + "epoch": 2.3743685482099712, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.465650796890259, + "learning_rate": 1e-06, + "loss": 0.9789, + 
"mean_token_accuracy": 0.7073838710784912, + "num_tokens": 559378716.0, + "step": 21621 + }, + { + "epoch": 2.374478365912585, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.151087999343872, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6930447816848755, + "num_tokens": 559412640.0, + "step": 21622 + }, + { + "epoch": 2.3745881836151987, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7975809574127197, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7270422577857971, + "num_tokens": 559431472.0, + "step": 21623 + }, + { + "epoch": 2.3746980013178125, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6668221950531006, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7286198139190674, + "num_tokens": 559453465.0, + "step": 21624 + }, + { + "epoch": 2.3748078190204263, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4624736309051514, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6999270915985107, + "num_tokens": 559479660.0, + "step": 21625 + }, + { + "epoch": 2.3749176367230396, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2060129642486572, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7264436483383179, + "num_tokens": 559509927.0, + "step": 21626 + }, + { + "epoch": 2.3750274544256533, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5279417037963867, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7139270901679993, + "num_tokens": 559535535.0, + "step": 21627 + }, + { + "epoch": 2.375137272128267, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4136390686035156, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7243747115135193, + "num_tokens": 559563278.0, + "step": 21628 + }, + { + "epoch": 2.375247089830881, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3597235679626465, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 
0.712998628616333, + "num_tokens": 559588563.0, + "step": 21629 + }, + { + "epoch": 2.3753569075334946, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3447115421295166, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6950751543045044, + "num_tokens": 559619051.0, + "step": 21630 + }, + { + "epoch": 2.375466725236108, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.628554105758667, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7394531965255737, + "num_tokens": 559639925.0, + "step": 21631 + }, + { + "epoch": 2.3755765429387217, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.207902431488037, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7364122867584229, + "num_tokens": 559671210.0, + "step": 21632 + }, + { + "epoch": 2.3756863606413354, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.660109043121338, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7110247611999512, + "num_tokens": 559694236.0, + "step": 21633 + }, + { + "epoch": 2.375796178343949, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7881805896759033, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7082290649414062, + "num_tokens": 559714548.0, + "step": 21634 + }, + { + "epoch": 2.375905996046563, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 3.782437801361084, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7026073932647705, + "num_tokens": 559744762.0, + "step": 21635 + }, + { + "epoch": 2.3760158137491763, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3717753887176514, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7140417098999023, + "num_tokens": 559770635.0, + "step": 21636 + }, + { + "epoch": 2.37612563145179, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.9183263778686523, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.713787317276001, + "num_tokens": 
559791610.0, + "step": 21637 + }, + { + "epoch": 2.3762354491544038, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.604174852371216, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7136508822441101, + "num_tokens": 559814186.0, + "step": 21638 + }, + { + "epoch": 2.3763452668570175, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 8.616717338562012, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.7049779891967773, + "num_tokens": 559835190.0, + "step": 21639 + }, + { + "epoch": 2.376455084559631, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2555580139160156, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7057968974113464, + "num_tokens": 559866682.0, + "step": 21640 + }, + { + "epoch": 2.3765649022622446, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3444247245788574, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7077487707138062, + "num_tokens": 559895962.0, + "step": 21641 + }, + { + "epoch": 2.3766747199648584, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.59814190864563, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7360237836837769, + "num_tokens": 559918730.0, + "step": 21642 + }, + { + "epoch": 2.376784537667472, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5682809352874756, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7114496231079102, + "num_tokens": 559942407.0, + "step": 21643 + }, + { + "epoch": 2.3768943553700854, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4911422729492188, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6955927610397339, + "num_tokens": 559969006.0, + "step": 21644 + }, + { + "epoch": 2.377004173072699, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.527801990509033, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.715665876865387, + "num_tokens": 559992947.0, + "step": 21645 + }, 
+ { + "epoch": 2.377113990775313, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.425337076187134, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.69089674949646, + "num_tokens": 560020395.0, + "step": 21646 + }, + { + "epoch": 2.3772238084779267, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.426525354385376, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7229728698730469, + "num_tokens": 560048467.0, + "step": 21647 + }, + { + "epoch": 2.3773336261805404, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.665712356567383, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7163363099098206, + "num_tokens": 560070449.0, + "step": 21648 + }, + { + "epoch": 2.3774434438831538, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1533265113830566, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6936383843421936, + "num_tokens": 560102206.0, + "step": 21649 + }, + { + "epoch": 2.3775532615857675, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.502211570739746, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7117727994918823, + "num_tokens": 560124865.0, + "step": 21650 + }, + { + "epoch": 2.3776630792883813, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5229430198669434, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7240793704986572, + "num_tokens": 560146976.0, + "step": 21651 + }, + { + "epoch": 2.377772896990995, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.721904993057251, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7284673452377319, + "num_tokens": 560166722.0, + "step": 21652 + }, + { + "epoch": 2.377882714693609, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6543657779693604, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7210716009140015, + "num_tokens": 560189172.0, + "step": 21653 + }, + { + "epoch": 2.377992532396222, + 
"ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5563714504241943, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.7061508297920227, + "num_tokens": 560214066.0, + "step": 21654 + }, + { + "epoch": 2.378102350098836, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1724233627319336, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7212957143783569, + "num_tokens": 560246232.0, + "step": 21655 + }, + { + "epoch": 2.3782121678014496, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3674561977386475, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7159916162490845, + "num_tokens": 560272804.0, + "step": 21656 + }, + { + "epoch": 2.3783219855040634, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5103776454925537, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7181763052940369, + "num_tokens": 560296297.0, + "step": 21657 + }, + { + "epoch": 2.378431803206677, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.397967576980591, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.7015894651412964, + "num_tokens": 560319923.0, + "step": 21658 + }, + { + "epoch": 2.3785416209092904, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.561495065689087, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7084065675735474, + "num_tokens": 560345057.0, + "step": 21659 + }, + { + "epoch": 2.378651438611904, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.8362627029418945, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7178372740745544, + "num_tokens": 560365551.0, + "step": 21660 + }, + { + "epoch": 2.378761256314518, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.445230007171631, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7398579120635986, + "num_tokens": 560390724.0, + "step": 21661 + }, + { + "epoch": 2.3788710740171317, + "ewc_loss": 
2.1696090698242188e-05, + "grad_norm": 2.7512526512145996, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.702786922454834, + "num_tokens": 560412951.0, + "step": 21662 + }, + { + "epoch": 2.3789808917197455, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5602047443389893, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.711125373840332, + "num_tokens": 560437061.0, + "step": 21663 + }, + { + "epoch": 2.379090709422359, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.531250476837158, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7423892021179199, + "num_tokens": 560459323.0, + "step": 21664 + }, + { + "epoch": 2.3792005271249725, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7399277687072754, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.707323431968689, + "num_tokens": 560480023.0, + "step": 21665 + }, + { + "epoch": 2.3793103448275863, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.554234027862549, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.704474687576294, + "num_tokens": 560507057.0, + "step": 21666 + }, + { + "epoch": 2.3794201625302, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.592290163040161, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.70392906665802, + "num_tokens": 560531806.0, + "step": 21667 + }, + { + "epoch": 2.3795299802328134, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2307915687561035, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7191579341888428, + "num_tokens": 560562967.0, + "step": 21668 + }, + { + "epoch": 2.379639797935427, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.415738821029663, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7291388511657715, + "num_tokens": 560587734.0, + "step": 21669 + }, + { + "epoch": 2.379749615638041, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 
2.544394016265869, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7053908109664917, + "num_tokens": 560612771.0, + "step": 21670 + }, + { + "epoch": 2.3798594333406546, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.849689483642578, + "learning_rate": 1e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7672639489173889, + "num_tokens": 560631954.0, + "step": 21671 + }, + { + "epoch": 2.379969251043268, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1937410831451416, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6847347021102905, + "num_tokens": 560664443.0, + "step": 21672 + }, + { + "epoch": 2.3800790687458817, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.515190601348877, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.715835690498352, + "num_tokens": 560689391.0, + "step": 21673 + }, + { + "epoch": 2.3801888864484955, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2694408893585205, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7085052132606506, + "num_tokens": 560717244.0, + "step": 21674 + }, + { + "epoch": 2.3802987041511092, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.44108247756958, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7325111627578735, + "num_tokens": 560738910.0, + "step": 21675 + }, + { + "epoch": 2.380408521853723, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1718873977661133, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.711679220199585, + "num_tokens": 560769723.0, + "step": 21676 + }, + { + "epoch": 2.3805183395563363, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1517181396484375, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7063734531402588, + "num_tokens": 560806610.0, + "step": 21677 + }, + { + "epoch": 2.38062815725895, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6036980152130127, + 
"learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.7085788249969482, + "num_tokens": 560829502.0, + "step": 21678 + }, + { + "epoch": 2.380737974961564, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.780388832092285, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7360260486602783, + "num_tokens": 560855250.0, + "step": 21679 + }, + { + "epoch": 2.3808477926641776, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.696765184402466, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7320616245269775, + "num_tokens": 560876357.0, + "step": 21680 + }, + { + "epoch": 2.3809576103667913, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5776140689849854, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6967878341674805, + "num_tokens": 560902717.0, + "step": 21681 + }, + { + "epoch": 2.3810674280694046, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3003385066986084, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7260206341743469, + "num_tokens": 560930540.0, + "step": 21682 + }, + { + "epoch": 2.3811772457720184, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5861172676086426, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.725075364112854, + "num_tokens": 560954786.0, + "step": 21683 + }, + { + "epoch": 2.381287063474632, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.42512845993042, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7173247933387756, + "num_tokens": 560981734.0, + "step": 21684 + }, + { + "epoch": 2.381396881177246, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.47582745552063, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7242922782897949, + "num_tokens": 561004489.0, + "step": 21685 + }, + { + "epoch": 2.3815066988798597, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7028801441192627, + "learning_rate": 1e-06, + "loss": 
0.905, + "mean_token_accuracy": 0.7294163703918457, + "num_tokens": 561024652.0, + "step": 21686 + }, + { + "epoch": 2.381616516582473, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4345760345458984, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7242255210876465, + "num_tokens": 561049401.0, + "step": 21687 + }, + { + "epoch": 2.3817263342850867, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6993935108184814, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.728901743888855, + "num_tokens": 561071127.0, + "step": 21688 + }, + { + "epoch": 2.3818361519877005, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4058163166046143, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6972801089286804, + "num_tokens": 561095712.0, + "step": 21689 + }, + { + "epoch": 2.3819459696903142, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5167453289031982, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7264219522476196, + "num_tokens": 561118285.0, + "step": 21690 + }, + { + "epoch": 2.3820557873929276, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.337062358856201, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6967694759368896, + "num_tokens": 561147901.0, + "step": 21691 + }, + { + "epoch": 2.3821656050955413, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3360166549682617, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7088773250579834, + "num_tokens": 561176893.0, + "step": 21692 + }, + { + "epoch": 2.382275422798155, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2749035358428955, + "learning_rate": 1e-06, + "loss": 1.109, + "mean_token_accuracy": 0.6743278503417969, + "num_tokens": 561207332.0, + "step": 21693 + }, + { + "epoch": 2.382385240500769, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7187700271606445, + "learning_rate": 1e-06, + "loss": 0.9458, + 
"mean_token_accuracy": 0.7227920293807983, + "num_tokens": 561228511.0, + "step": 21694 + }, + { + "epoch": 2.382495058203382, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3526015281677246, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.7092660665512085, + "num_tokens": 561256955.0, + "step": 21695 + }, + { + "epoch": 2.382604875905996, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5877909660339355, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7206045389175415, + "num_tokens": 561279513.0, + "step": 21696 + }, + { + "epoch": 2.3827146936086097, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.401507616043091, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7058018445968628, + "num_tokens": 561307164.0, + "step": 21697 + }, + { + "epoch": 2.3828245113112234, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.286154270172119, + "learning_rate": 1e-06, + "loss": 1.1185, + "mean_token_accuracy": 0.6785231828689575, + "num_tokens": 561337569.0, + "step": 21698 + }, + { + "epoch": 2.382934329013837, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3273556232452393, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7032769918441772, + "num_tokens": 561367097.0, + "step": 21699 + }, + { + "epoch": 2.3830441467164505, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3768813610076904, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7087357640266418, + "num_tokens": 561394499.0, + "step": 21700 + }, + { + "epoch": 2.3831539644190642, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2949864864349365, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7143387198448181, + "num_tokens": 561422943.0, + "step": 21701 + }, + { + "epoch": 2.383263782121678, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4741475582122803, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 
0.7271476984024048, + "num_tokens": 561446362.0, + "step": 21702 + }, + { + "epoch": 2.3833735998242918, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.511932373046875, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.7034083008766174, + "num_tokens": 561471673.0, + "step": 21703 + }, + { + "epoch": 2.3834834175269055, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.753812074661255, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7197248935699463, + "num_tokens": 561491792.0, + "step": 21704 + }, + { + "epoch": 2.383593235229519, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2402775287628174, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6918196678161621, + "num_tokens": 561523490.0, + "step": 21705 + }, + { + "epoch": 2.3837030529321326, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.687952995300293, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7157582640647888, + "num_tokens": 561546471.0, + "step": 21706 + }, + { + "epoch": 2.3838128706347463, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5556585788726807, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7342439889907837, + "num_tokens": 561568689.0, + "step": 21707 + }, + { + "epoch": 2.38392268833736, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.391497850418091, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7079412937164307, + "num_tokens": 561595060.0, + "step": 21708 + }, + { + "epoch": 2.384032506039974, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.452577590942383, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7310045957565308, + "num_tokens": 561619795.0, + "step": 21709 + }, + { + "epoch": 2.384142323742587, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 3.242511749267578, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7295782566070557, + 
"num_tokens": 561636235.0, + "step": 21710 + }, + { + "epoch": 2.384252141445201, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.216137647628784, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.733590841293335, + "num_tokens": 561665324.0, + "step": 21711 + }, + { + "epoch": 2.3843619591478147, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.581739902496338, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6971762180328369, + "num_tokens": 561686989.0, + "step": 21712 + }, + { + "epoch": 2.3844717768504284, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3992841243743896, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7077171802520752, + "num_tokens": 561713475.0, + "step": 21713 + }, + { + "epoch": 2.384581594553042, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.328626871109009, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7135039567947388, + "num_tokens": 561740807.0, + "step": 21714 + }, + { + "epoch": 2.3846914122556555, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.617990493774414, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7272162437438965, + "num_tokens": 561764139.0, + "step": 21715 + }, + { + "epoch": 2.3848012299582693, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2572884559631348, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.731220006942749, + "num_tokens": 561791526.0, + "step": 21716 + }, + { + "epoch": 2.384911047660883, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6724376678466797, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7162312269210815, + "num_tokens": 561813069.0, + "step": 21717 + }, + { + "epoch": 2.385020865363497, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3803718090057373, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7243683338165283, + "num_tokens": 561838720.0, + 
"step": 21718 + }, + { + "epoch": 2.38513068306611, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4445595741271973, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7224014401435852, + "num_tokens": 561864126.0, + "step": 21719 + }, + { + "epoch": 2.385240500768724, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.519310474395752, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7161100506782532, + "num_tokens": 561889744.0, + "step": 21720 + }, + { + "epoch": 2.3853503184713376, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.396672487258911, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.716062605381012, + "num_tokens": 561914889.0, + "step": 21721 + }, + { + "epoch": 2.3854601361739514, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.521634340286255, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7128490805625916, + "num_tokens": 561939297.0, + "step": 21722 + }, + { + "epoch": 2.3855699538765647, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3741648197174072, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7156762480735779, + "num_tokens": 561965447.0, + "step": 21723 + }, + { + "epoch": 2.3856797715791784, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4590370655059814, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6837432384490967, + "num_tokens": 561994561.0, + "step": 21724 + }, + { + "epoch": 2.385789589281792, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.0729382038116455, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7301269769668579, + "num_tokens": 562028541.0, + "step": 21725 + }, + { + "epoch": 2.385899406984406, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2299342155456543, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7189666628837585, + "num_tokens": 562059217.0, + "step": 21726 + }, + { + 
"epoch": 2.3860092246870197, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.376241445541382, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7249755263328552, + "num_tokens": 562085241.0, + "step": 21727 + }, + { + "epoch": 2.386119042389633, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4868433475494385, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7304255962371826, + "num_tokens": 562109433.0, + "step": 21728 + }, + { + "epoch": 2.3862288600922468, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.779414176940918, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7186025381088257, + "num_tokens": 562130464.0, + "step": 21729 + }, + { + "epoch": 2.3863386777948605, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.575277805328369, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7255550622940063, + "num_tokens": 562154334.0, + "step": 21730 + }, + { + "epoch": 2.3864484954974743, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4564526081085205, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7333593368530273, + "num_tokens": 562179353.0, + "step": 21731 + }, + { + "epoch": 2.386558313200088, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.35500168800354, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7136452794075012, + "num_tokens": 562209174.0, + "step": 21732 + }, + { + "epoch": 2.3866681309027014, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.495177745819092, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7300018072128296, + "num_tokens": 562238300.0, + "step": 21733 + }, + { + "epoch": 2.386777948605315, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.28045654296875, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7053502202033997, + "num_tokens": 562267018.0, + "step": 21734 + }, + { + "epoch": 2.386887766307929, + 
"ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.19974946975708, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7430357933044434, + "num_tokens": 562296651.0, + "step": 21735 + }, + { + "epoch": 2.3869975840105426, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.52243709564209, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7032466530799866, + "num_tokens": 562322156.0, + "step": 21736 + }, + { + "epoch": 2.3871074017131564, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5804836750030518, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7218481302261353, + "num_tokens": 562344786.0, + "step": 21737 + }, + { + "epoch": 2.3872172194157697, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5890753269195557, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7182157039642334, + "num_tokens": 562368680.0, + "step": 21738 + }, + { + "epoch": 2.3873270371183835, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3074193000793457, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6938022971153259, + "num_tokens": 562396086.0, + "step": 21739 + }, + { + "epoch": 2.387436854820997, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2810771465301514, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7134026288986206, + "num_tokens": 562424322.0, + "step": 21740 + }, + { + "epoch": 2.387546672523611, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.425966262817383, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6885954737663269, + "num_tokens": 562449818.0, + "step": 21741 + }, + { + "epoch": 2.3876564902262243, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.339409828186035, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7127125263214111, + "num_tokens": 562479687.0, + "step": 21742 + }, + { + "epoch": 2.387766307928838, + "ewc_loss": 
2.1696090698242188e-05, + "grad_norm": 2.7489683628082275, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7366408109664917, + "num_tokens": 562500708.0, + "step": 21743 + }, + { + "epoch": 2.387876125631452, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6658313274383545, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.7012761831283569, + "num_tokens": 562524041.0, + "step": 21744 + }, + { + "epoch": 2.3879859433340656, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.668442487716675, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7111311554908752, + "num_tokens": 562551157.0, + "step": 21745 + }, + { + "epoch": 2.388095761036679, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1788339614868164, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7231520414352417, + "num_tokens": 562583952.0, + "step": 21746 + }, + { + "epoch": 2.3882055787392926, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7303390502929688, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7215874195098877, + "num_tokens": 562606927.0, + "step": 21747 + }, + { + "epoch": 2.3883153964419064, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.8041415214538574, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7151665091514587, + "num_tokens": 562627104.0, + "step": 21748 + }, + { + "epoch": 2.38842521414452, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7513020038604736, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7116525769233704, + "num_tokens": 562649867.0, + "step": 21749 + }, + { + "epoch": 2.388535031847134, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7167563438415527, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6945362687110901, + "num_tokens": 562672739.0, + "step": 21750 + }, + { + "epoch": 2.388644849549747, + "ewc_loss": 2.1696090698242188e-05, + 
"grad_norm": 2.4885733127593994, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6979425549507141, + "num_tokens": 562699666.0, + "step": 21751 + }, + { + "epoch": 2.388754667252361, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7531144618988037, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7311059832572937, + "num_tokens": 562719402.0, + "step": 21752 + }, + { + "epoch": 2.3888644849549747, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.63146710395813, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7049539685249329, + "num_tokens": 562743460.0, + "step": 21753 + }, + { + "epoch": 2.3889743026575885, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5403387546539307, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7070613503456116, + "num_tokens": 562766855.0, + "step": 21754 + }, + { + "epoch": 2.3890841203602022, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.617401361465454, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7215069532394409, + "num_tokens": 562789578.0, + "step": 21755 + }, + { + "epoch": 2.3891939380628155, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3404688835144043, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7224692106246948, + "num_tokens": 562816256.0, + "step": 21756 + }, + { + "epoch": 2.3893037557654293, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 3.9064886569976807, + "learning_rate": 1e-06, + "loss": 1.0949, + "mean_token_accuracy": 0.6821321845054626, + "num_tokens": 562842188.0, + "step": 21757 + }, + { + "epoch": 2.389413573468043, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.35518741607666, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7176356315612793, + "num_tokens": 562870493.0, + "step": 21758 + }, + { + "epoch": 2.389523391170657, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 
2.4543116092681885, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7167075872421265, + "num_tokens": 562896126.0, + "step": 21759 + }, + { + "epoch": 2.3896332088732706, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7929739952087402, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7309037446975708, + "num_tokens": 562916736.0, + "step": 21760 + }, + { + "epoch": 2.389743026575884, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.184652328491211, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.7057594060897827, + "num_tokens": 562948370.0, + "step": 21761 + }, + { + "epoch": 2.3898528442784976, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2977051734924316, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7289141416549683, + "num_tokens": 562974389.0, + "step": 21762 + }, + { + "epoch": 2.3899626619811114, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3669207096099854, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7164016962051392, + "num_tokens": 563001419.0, + "step": 21763 + }, + { + "epoch": 2.390072479683725, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.410106897354126, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6955654621124268, + "num_tokens": 563028210.0, + "step": 21764 + }, + { + "epoch": 2.390182297386339, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.473850965499878, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7230237722396851, + "num_tokens": 563052099.0, + "step": 21765 + }, + { + "epoch": 2.3902921150889522, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7487213611602783, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.70606529712677, + "num_tokens": 563074501.0, + "step": 21766 + }, + { + "epoch": 2.390401932791566, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.168184757232666, + 
"learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7135283350944519, + "num_tokens": 563105438.0, + "step": 21767 + }, + { + "epoch": 2.3905117504941797, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5298068523406982, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7185043096542358, + "num_tokens": 563127829.0, + "step": 21768 + }, + { + "epoch": 2.3906215681967935, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6036570072174072, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7181718349456787, + "num_tokens": 563150649.0, + "step": 21769 + }, + { + "epoch": 2.390731385899407, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4013736248016357, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.7010170221328735, + "num_tokens": 563174548.0, + "step": 21770 + }, + { + "epoch": 2.3908412036020206, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.0556070804595947, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6961445212364197, + "num_tokens": 563211815.0, + "step": 21771 + }, + { + "epoch": 2.3909510213046343, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2880373001098633, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6983237862586975, + "num_tokens": 563240838.0, + "step": 21772 + }, + { + "epoch": 2.391060839007248, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.44331431388855, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7161867618560791, + "num_tokens": 563266701.0, + "step": 21773 + }, + { + "epoch": 2.3911706567098614, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.518550157546997, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7383090853691101, + "num_tokens": 563290876.0, + "step": 21774 + }, + { + "epoch": 2.391280474412475, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7563223838806152, + "learning_rate": 1e-06, + 
"loss": 0.8956, + "mean_token_accuracy": 0.7322746515274048, + "num_tokens": 563310841.0, + "step": 21775 + }, + { + "epoch": 2.391390292115089, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.531141519546509, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7346876859664917, + "num_tokens": 563335458.0, + "step": 21776 + }, + { + "epoch": 2.3915001098177027, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.309952974319458, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7375257015228271, + "num_tokens": 563362466.0, + "step": 21777 + }, + { + "epoch": 2.3916099275203164, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3198654651641846, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7182454466819763, + "num_tokens": 563391871.0, + "step": 21778 + }, + { + "epoch": 2.3917197452229297, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.706285238265991, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7063416242599487, + "num_tokens": 563417280.0, + "step": 21779 + }, + { + "epoch": 2.3918295629255435, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.412957191467285, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6982463598251343, + "num_tokens": 563446860.0, + "step": 21780 + }, + { + "epoch": 2.3919393806281573, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3462061882019043, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7133461833000183, + "num_tokens": 563476694.0, + "step": 21781 + }, + { + "epoch": 2.392049198330771, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.452650785446167, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7309317588806152, + "num_tokens": 563501967.0, + "step": 21782 + }, + { + "epoch": 2.3921590160333848, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.240471839904785, + "learning_rate": 1e-06, + "loss": 0.9978, + 
"mean_token_accuracy": 0.7089999914169312, + "num_tokens": 563533006.0, + "step": 21783 + }, + { + "epoch": 2.392268833735998, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4297759532928467, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7389330863952637, + "num_tokens": 563556085.0, + "step": 21784 + }, + { + "epoch": 2.392378651438612, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.37355637550354, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7180016040802002, + "num_tokens": 563583818.0, + "step": 21785 + }, + { + "epoch": 2.3924884691412256, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3998358249664307, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7075402736663818, + "num_tokens": 563611700.0, + "step": 21786 + }, + { + "epoch": 2.3925982868438394, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7354249954223633, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.713910698890686, + "num_tokens": 563635091.0, + "step": 21787 + }, + { + "epoch": 2.392708104546453, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.312553882598877, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7171998023986816, + "num_tokens": 563664243.0, + "step": 21788 + }, + { + "epoch": 2.3928179222490664, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6992151737213135, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7223143577575684, + "num_tokens": 563687430.0, + "step": 21789 + }, + { + "epoch": 2.39292773995168, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.301557779312134, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.710219144821167, + "num_tokens": 563716757.0, + "step": 21790 + }, + { + "epoch": 2.393037557654294, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5480103492736816, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 
0.7340726852416992, + "num_tokens": 563739681.0, + "step": 21791 + }, + { + "epoch": 2.3931473753569077, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4938371181488037, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7250708341598511, + "num_tokens": 563763013.0, + "step": 21792 + }, + { + "epoch": 2.393257193059521, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.447453260421753, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6937513947486877, + "num_tokens": 563789203.0, + "step": 21793 + }, + { + "epoch": 2.3933670107621348, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5261483192443848, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.71548992395401, + "num_tokens": 563813546.0, + "step": 21794 + }, + { + "epoch": 2.3934768284647485, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.429579257965088, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7059193849563599, + "num_tokens": 563840301.0, + "step": 21795 + }, + { + "epoch": 2.3935866461673623, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4419443607330322, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7100796103477478, + "num_tokens": 563867125.0, + "step": 21796 + }, + { + "epoch": 2.393696463869976, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.673293352127075, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7144298553466797, + "num_tokens": 563888995.0, + "step": 21797 + }, + { + "epoch": 2.3938062815725893, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.335294723510742, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7218540906906128, + "num_tokens": 563914370.0, + "step": 21798 + }, + { + "epoch": 2.393916099275203, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.390868902206421, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7289716005325317, + "num_tokens": 
563941515.0, + "step": 21799 + }, + { + "epoch": 2.394025916977817, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.308791399002075, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7332700490951538, + "num_tokens": 563967176.0, + "step": 21800 + }, + { + "epoch": 2.3941357346804306, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6419506072998047, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7295258641242981, + "num_tokens": 563989351.0, + "step": 21801 + }, + { + "epoch": 2.394245552383044, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.312241315841675, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7199769616127014, + "num_tokens": 564015982.0, + "step": 21802 + }, + { + "epoch": 2.3943553700856577, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2709238529205322, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7273164391517639, + "num_tokens": 564043171.0, + "step": 21803 + }, + { + "epoch": 2.3944651877882714, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2671380043029785, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.7057379484176636, + "num_tokens": 564073133.0, + "step": 21804 + }, + { + "epoch": 2.394575005490885, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3744282722473145, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.748208224773407, + "num_tokens": 564099305.0, + "step": 21805 + }, + { + "epoch": 2.394684823193499, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.387849807739258, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7290453910827637, + "num_tokens": 564124842.0, + "step": 21806 + }, + { + "epoch": 2.3947946408961123, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1768107414245605, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6945122480392456, + "num_tokens": 564158542.0, + "step": 21807 + 
}, + { + "epoch": 2.394904458598726, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3814682960510254, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7126247882843018, + "num_tokens": 564185746.0, + "step": 21808 + }, + { + "epoch": 2.39501427630134, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5397467613220215, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7204037308692932, + "num_tokens": 564208325.0, + "step": 21809 + }, + { + "epoch": 2.3951240940039535, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.317889451980591, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7131404876708984, + "num_tokens": 564237384.0, + "step": 21810 + }, + { + "epoch": 2.3952339117065673, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4296209812164307, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7027750015258789, + "num_tokens": 564263098.0, + "step": 21811 + }, + { + "epoch": 2.3953437294091806, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.501741647720337, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6970897316932678, + "num_tokens": 564287499.0, + "step": 21812 + }, + { + "epoch": 2.3954535471117944, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.791492223739624, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7186695337295532, + "num_tokens": 564307598.0, + "step": 21813 + }, + { + "epoch": 2.395563364814408, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2995197772979736, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6941187381744385, + "num_tokens": 564338375.0, + "step": 21814 + }, + { + "epoch": 2.395673182517022, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5082926750183105, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.718734622001648, + "num_tokens": 564361749.0, + "step": 21815 + }, + { + "epoch": 
2.3957830002196356, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 4.473161220550537, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.707746684551239, + "num_tokens": 564387269.0, + "step": 21816 + }, + { + "epoch": 2.395892817922249, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4480485916137695, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.7034972310066223, + "num_tokens": 564411993.0, + "step": 21817 + }, + { + "epoch": 2.3960026356248627, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3161606788635254, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7247123122215271, + "num_tokens": 564437303.0, + "step": 21818 + }, + { + "epoch": 2.3961124533274765, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7914791107177734, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7168205976486206, + "num_tokens": 564467197.0, + "step": 21819 + }, + { + "epoch": 2.3962222710300902, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.179112434387207, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7060143351554871, + "num_tokens": 564501039.0, + "step": 21820 + }, + { + "epoch": 2.3963320887327035, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3070900440216064, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7085193395614624, + "num_tokens": 564529656.0, + "step": 21821 + }, + { + "epoch": 2.3964419064353173, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1482133865356445, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7176353931427002, + "num_tokens": 564560411.0, + "step": 21822 + }, + { + "epoch": 2.396551724137931, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.208385705947876, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7188723087310791, + "num_tokens": 564589332.0, + "step": 21823 + }, + { + "epoch": 2.396661541840545, + "ewc_loss": 
2.1696090698242188e-05, + "grad_norm": 2.267077922821045, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7280773520469666, + "num_tokens": 564617369.0, + "step": 21824 + }, + { + "epoch": 2.396771359543158, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6471784114837646, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7092140913009644, + "num_tokens": 564640388.0, + "step": 21825 + }, + { + "epoch": 2.396881177245772, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.473785638809204, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.698043704032898, + "num_tokens": 564664886.0, + "step": 21826 + }, + { + "epoch": 2.3969909949483856, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.411018133163452, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7035930156707764, + "num_tokens": 564693525.0, + "step": 21827 + }, + { + "epoch": 2.3971008126509994, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.505350112915039, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.708621621131897, + "num_tokens": 564721609.0, + "step": 21828 + }, + { + "epoch": 2.397210630353613, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4532694816589355, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7154617309570312, + "num_tokens": 564747590.0, + "step": 21829 + }, + { + "epoch": 2.3973204480562265, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.2092719078063965, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6843984127044678, + "num_tokens": 564778965.0, + "step": 21830 + }, + { + "epoch": 2.39743026575884, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6031055450439453, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.726076066493988, + "num_tokens": 564800943.0, + "step": 21831 + }, + { + "epoch": 2.397540083461454, + "ewc_loss": 2.1696090698242188e-05, + 
"grad_norm": 2.5220532417297363, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.695106029510498, + "num_tokens": 564826490.0, + "step": 21832 + }, + { + "epoch": 2.3976499011640677, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5091946125030518, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7022292613983154, + "num_tokens": 564852336.0, + "step": 21833 + }, + { + "epoch": 2.3977597188666815, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4598371982574463, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7232340574264526, + "num_tokens": 564876595.0, + "step": 21834 + }, + { + "epoch": 2.397869536569295, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5669212341308594, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7119187116622925, + "num_tokens": 564906207.0, + "step": 21835 + }, + { + "epoch": 2.3979793542719086, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.894225597381592, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7278080582618713, + "num_tokens": 564925607.0, + "step": 21836 + }, + { + "epoch": 2.3980891719745223, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.5930919647216797, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.731924831867218, + "num_tokens": 564948981.0, + "step": 21837 + }, + { + "epoch": 2.398198989677136, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.30076265335083, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7016206979751587, + "num_tokens": 564980853.0, + "step": 21838 + }, + { + "epoch": 2.39830880737975, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.239064931869507, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6950923204421997, + "num_tokens": 565010521.0, + "step": 21839 + }, + { + "epoch": 2.398418625082363, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.7941641807556152, + 
"learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7378764152526855, + "num_tokens": 565030037.0, + "step": 21840 + }, + { + "epoch": 2.398528442784977, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.9074838161468506, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7092573046684265, + "num_tokens": 565056268.0, + "step": 21841 + }, + { + "epoch": 2.3986382604875907, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.42905330657959, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.7005749344825745, + "num_tokens": 565082378.0, + "step": 21842 + }, + { + "epoch": 2.3987480781902044, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.999211311340332, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7295315861701965, + "num_tokens": 565100237.0, + "step": 21843 + }, + { + "epoch": 2.398857895892818, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.451929807662964, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6999524831771851, + "num_tokens": 565125361.0, + "step": 21844 + }, + { + "epoch": 2.3989677135954315, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.6896414756774902, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7435476779937744, + "num_tokens": 565146362.0, + "step": 21845 + }, + { + "epoch": 2.3990775312980452, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.666534185409546, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7333624362945557, + "num_tokens": 565168609.0, + "step": 21846 + }, + { + "epoch": 2.399187349000659, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.583918333053589, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7060816287994385, + "num_tokens": 565192456.0, + "step": 21847 + }, + { + "epoch": 2.3992971667032728, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.325669765472412, + "learning_rate": 1e-06, + "loss": 
0.9912, + "mean_token_accuracy": 0.6999400854110718, + "num_tokens": 565221728.0, + "step": 21848 + }, + { + "epoch": 2.399406984405886, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1940202713012695, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7241572141647339, + "num_tokens": 565253334.0, + "step": 21849 + }, + { + "epoch": 2.3995168021085, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3905110359191895, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.702049195766449, + "num_tokens": 565280534.0, + "step": 21850 + }, + { + "epoch": 2.3996266198111136, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4116578102111816, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7137137651443481, + "num_tokens": 565307008.0, + "step": 21851 + }, + { + "epoch": 2.3997364375137273, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6146750450134277, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7226213216781616, + "num_tokens": 565329287.0, + "step": 21852 + }, + { + "epoch": 2.3998462552163407, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2564666271209717, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7280289530754089, + "num_tokens": 565358958.0, + "step": 21853 + }, + { + "epoch": 2.3999560729189544, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3158059120178223, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7307850122451782, + "num_tokens": 565388515.0, + "step": 21854 + }, + { + "epoch": 2.400065890621568, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6272835731506348, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7159432172775269, + "num_tokens": 565411671.0, + "step": 21855 + }, + { + "epoch": 2.400175708324182, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5140738487243652, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 
0.7300055623054504, + "num_tokens": 565434986.0, + "step": 21856 + }, + { + "epoch": 2.4002855260267957, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 3.841049909591675, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7096267938613892, + "num_tokens": 565460261.0, + "step": 21857 + }, + { + "epoch": 2.400395343729409, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3289456367492676, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7029620409011841, + "num_tokens": 565489292.0, + "step": 21858 + }, + { + "epoch": 2.4005051614320227, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3947246074676514, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7408401966094971, + "num_tokens": 565512923.0, + "step": 21859 + }, + { + "epoch": 2.4006149791346365, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.328828811645508, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.6847431659698486, + "num_tokens": 565542048.0, + "step": 21860 + }, + { + "epoch": 2.4007247968372503, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2527811527252197, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7012784481048584, + "num_tokens": 565571858.0, + "step": 21861 + }, + { + "epoch": 2.400834614539864, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4593863487243652, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.7021239995956421, + "num_tokens": 565597292.0, + "step": 21862 + }, + { + "epoch": 2.4009444322424773, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3297529220581055, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7232563495635986, + "num_tokens": 565622850.0, + "step": 21863 + }, + { + "epoch": 2.401054249945091, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5596706867218018, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7069860696792603, + "num_tokens": 
565645293.0, + "step": 21864 + }, + { + "epoch": 2.401164067647705, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3173580169677734, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7541429996490479, + "num_tokens": 565673669.0, + "step": 21865 + }, + { + "epoch": 2.4012738853503186, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2210867404937744, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6919781565666199, + "num_tokens": 565703567.0, + "step": 21866 + }, + { + "epoch": 2.4013837030529324, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.3995063304901123, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7040497660636902, + "num_tokens": 565728942.0, + "step": 21867 + }, + { + "epoch": 2.4014935207555457, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.1875617504119873, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7160080671310425, + "num_tokens": 565760420.0, + "step": 21868 + }, + { + "epoch": 2.4016033384581594, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.582608461380005, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7254419326782227, + "num_tokens": 565782604.0, + "step": 21869 + }, + { + "epoch": 2.401713156160773, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.696802854537964, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7065750360488892, + "num_tokens": 565803210.0, + "step": 21870 + }, + { + "epoch": 2.401822973863387, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2923059463500977, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7115797996520996, + "num_tokens": 565830695.0, + "step": 21871 + }, + { + "epoch": 2.4019327915660003, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.4307992458343506, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7118760347366333, + "num_tokens": 565855945.0, + "step": 21872 + 
}, + { + "epoch": 2.402042609268614, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8544225692749023, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7281099557876587, + "num_tokens": 565875323.0, + "step": 21873 + }, + { + "epoch": 2.4021524269712278, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3120594024658203, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.689690113067627, + "num_tokens": 565904005.0, + "step": 21874 + }, + { + "epoch": 2.4022622446738415, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2632057666778564, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.7057040929794312, + "num_tokens": 565933325.0, + "step": 21875 + }, + { + "epoch": 2.402372062376455, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.834064245223999, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7156875729560852, + "num_tokens": 565952176.0, + "step": 21876 + }, + { + "epoch": 2.4024818800790686, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.716125726699829, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7261868715286255, + "num_tokens": 565975579.0, + "step": 21877 + }, + { + "epoch": 2.4025916977816824, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3503496646881104, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.7015354633331299, + "num_tokens": 566003823.0, + "step": 21878 + }, + { + "epoch": 2.402701515484296, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4409570693969727, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7211835384368896, + "num_tokens": 566031231.0, + "step": 21879 + }, + { + "epoch": 2.40281133318691, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7044837474823, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7173122763633728, + "num_tokens": 566053359.0, + "step": 21880 + }, + { + "epoch": 2.402921150889523, + 
"ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.356691360473633, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7235117554664612, + "num_tokens": 566081184.0, + "step": 21881 + }, + { + "epoch": 2.403030968592137, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.561781167984009, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7048963308334351, + "num_tokens": 566107003.0, + "step": 21882 + }, + { + "epoch": 2.4031407862947507, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4108057022094727, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7063544988632202, + "num_tokens": 566131996.0, + "step": 21883 + }, + { + "epoch": 2.4032506039973645, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3515050411224365, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7240297198295593, + "num_tokens": 566156993.0, + "step": 21884 + }, + { + "epoch": 2.403360421699978, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3818044662475586, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.7012778520584106, + "num_tokens": 566181627.0, + "step": 21885 + }, + { + "epoch": 2.4034702394025915, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4894094467163086, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7101560831069946, + "num_tokens": 566207643.0, + "step": 21886 + }, + { + "epoch": 2.4035800571052053, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.492279529571533, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7106204628944397, + "num_tokens": 566232623.0, + "step": 21887 + }, + { + "epoch": 2.403689874807819, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4693567752838135, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7047242522239685, + "num_tokens": 566259132.0, + "step": 21888 + }, + { + "epoch": 2.403799692510433, + "ewc_loss": 2.181529998779297e-05, + 
"grad_norm": 2.581275463104248, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7333565950393677, + "num_tokens": 566283076.0, + "step": 21889 + }, + { + "epoch": 2.4039095102130466, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.509632110595703, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7250989079475403, + "num_tokens": 566307400.0, + "step": 21890 + }, + { + "epoch": 2.40401932791566, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.923661470413208, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7215021252632141, + "num_tokens": 566326041.0, + "step": 21891 + }, + { + "epoch": 2.4041291456182736, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3978939056396484, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7151581048965454, + "num_tokens": 566355322.0, + "step": 21892 + }, + { + "epoch": 2.4042389633208874, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4196691513061523, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7142280340194702, + "num_tokens": 566381855.0, + "step": 21893 + }, + { + "epoch": 2.404348781023501, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.40401291847229, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7297331094741821, + "num_tokens": 566405157.0, + "step": 21894 + }, + { + "epoch": 2.404458598726115, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.334754228591919, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7368307113647461, + "num_tokens": 566432532.0, + "step": 21895 + }, + { + "epoch": 2.404568416428728, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.865741014480591, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7289270162582397, + "num_tokens": 566452057.0, + "step": 21896 + }, + { + "epoch": 2.404678234131342, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.822544574737549, + 
"learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7257814407348633, + "num_tokens": 566472312.0, + "step": 21897 + }, + { + "epoch": 2.4047880518339557, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.461817741394043, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7210826277732849, + "num_tokens": 566498063.0, + "step": 21898 + }, + { + "epoch": 2.4048978695365695, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5899171829223633, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7180445194244385, + "num_tokens": 566520570.0, + "step": 21899 + }, + { + "epoch": 2.405007687239183, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.256226062774658, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7125183343887329, + "num_tokens": 566547892.0, + "step": 21900 + }, + { + "epoch": 2.4051175049417965, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6001155376434326, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.724726676940918, + "num_tokens": 566569457.0, + "step": 21901 + }, + { + "epoch": 2.4052273226444103, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1627352237701416, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7091848850250244, + "num_tokens": 566601355.0, + "step": 21902 + }, + { + "epoch": 2.405337140347024, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6433095932006836, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7300045490264893, + "num_tokens": 566622944.0, + "step": 21903 + }, + { + "epoch": 2.4054469580496374, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.404642105102539, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.7041694521903992, + "num_tokens": 566649726.0, + "step": 21904 + }, + { + "epoch": 2.405556775752251, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.46669864654541, + "learning_rate": 1e-06, + "loss": 1.037, + 
"mean_token_accuracy": 0.6995812654495239, + "num_tokens": 566676527.0, + "step": 21905 + }, + { + "epoch": 2.405666593454865, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.318817377090454, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7204840183258057, + "num_tokens": 566703545.0, + "step": 21906 + }, + { + "epoch": 2.4057764111574786, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.541123867034912, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7328381538391113, + "num_tokens": 566730234.0, + "step": 21907 + }, + { + "epoch": 2.4058862288600924, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3956847190856934, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7151533365249634, + "num_tokens": 566755985.0, + "step": 21908 + }, + { + "epoch": 2.4059960465627057, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.523822069168091, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7097097635269165, + "num_tokens": 566780924.0, + "step": 21909 + }, + { + "epoch": 2.4061058642653195, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5065574645996094, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7280576229095459, + "num_tokens": 566805480.0, + "step": 21910 + }, + { + "epoch": 2.4062156819679332, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.544415235519409, + "learning_rate": 1e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6811790466308594, + "num_tokens": 566832750.0, + "step": 21911 + }, + { + "epoch": 2.406325499670547, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.621535539627075, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7262783050537109, + "num_tokens": 566855160.0, + "step": 21912 + }, + { + "epoch": 2.4064353173731607, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.637402296066284, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 
0.7260527014732361, + "num_tokens": 566876672.0, + "step": 21913 + }, + { + "epoch": 2.406545135075774, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.339735269546509, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6992616653442383, + "num_tokens": 566906330.0, + "step": 21914 + }, + { + "epoch": 2.406654952778388, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5244803428649902, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6986003518104553, + "num_tokens": 566930922.0, + "step": 21915 + }, + { + "epoch": 2.4067647704810016, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3846218585968018, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7309238314628601, + "num_tokens": 566957596.0, + "step": 21916 + }, + { + "epoch": 2.4068745881836153, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5282626152038574, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7108170390129089, + "num_tokens": 566982957.0, + "step": 21917 + }, + { + "epoch": 2.406984405886229, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.48842191696167, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7149263024330139, + "num_tokens": 567006482.0, + "step": 21918 + }, + { + "epoch": 2.4070942235888424, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.461012601852417, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7207793593406677, + "num_tokens": 567032029.0, + "step": 21919 + }, + { + "epoch": 2.407204041291456, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.47198748588562, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7169933319091797, + "num_tokens": 567057629.0, + "step": 21920 + }, + { + "epoch": 2.40731385899407, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3579745292663574, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7100690603256226, + "num_tokens": 
567085534.0, + "step": 21921 + }, + { + "epoch": 2.4074236766966837, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.200509786605835, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7020642757415771, + "num_tokens": 567118239.0, + "step": 21922 + }, + { + "epoch": 2.407533494399297, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.567884683609009, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7287322282791138, + "num_tokens": 567141682.0, + "step": 21923 + }, + { + "epoch": 2.4076433121019107, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1496694087982178, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7130963802337646, + "num_tokens": 567174548.0, + "step": 21924 + }, + { + "epoch": 2.4077531298045245, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6046557426452637, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7290041446685791, + "num_tokens": 567197996.0, + "step": 21925 + }, + { + "epoch": 2.4078629475071383, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1773788928985596, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.705774188041687, + "num_tokens": 567231224.0, + "step": 21926 + }, + { + "epoch": 2.4079727652097516, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.493537425994873, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7095547914505005, + "num_tokens": 567257919.0, + "step": 21927 + }, + { + "epoch": 2.4080825829123653, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.823617458343506, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7080720663070679, + "num_tokens": 567283142.0, + "step": 21928 + }, + { + "epoch": 2.408192400614979, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4578146934509277, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.697868824005127, + "num_tokens": 567308445.0, + "step": 21929 + }, + { + 
"epoch": 2.408302218317593, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.299973487854004, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7154984474182129, + "num_tokens": 567337026.0, + "step": 21930 + }, + { + "epoch": 2.4084120360202066, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5486502647399902, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6951246857643127, + "num_tokens": 567362751.0, + "step": 21931 + }, + { + "epoch": 2.40852185372282, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.417088747024536, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7211126685142517, + "num_tokens": 567389259.0, + "step": 21932 + }, + { + "epoch": 2.4086316714254337, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.577106475830078, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.719700813293457, + "num_tokens": 567413614.0, + "step": 21933 + }, + { + "epoch": 2.4087414891280474, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.156850814819336, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7166672348976135, + "num_tokens": 567449268.0, + "step": 21934 + }, + { + "epoch": 2.408851306830661, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.779663324356079, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7329442501068115, + "num_tokens": 567469816.0, + "step": 21935 + }, + { + "epoch": 2.408961124533275, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3919332027435303, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.706058919429779, + "num_tokens": 567495710.0, + "step": 21936 + }, + { + "epoch": 2.4090709422358882, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6074650287628174, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.726624608039856, + "num_tokens": 567518594.0, + "step": 21937 + }, + { + "epoch": 2.409180759938502, + "ewc_loss": 
2.181529998779297e-05, + "grad_norm": 2.5150744915008545, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7176343202590942, + "num_tokens": 567545070.0, + "step": 21938 + }, + { + "epoch": 2.4092905776411158, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3574178218841553, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7229818105697632, + "num_tokens": 567572442.0, + "step": 21939 + }, + { + "epoch": 2.4094003953437295, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6066813468933105, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7056896686553955, + "num_tokens": 567597535.0, + "step": 21940 + }, + { + "epoch": 2.4095102130463433, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4318742752075195, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6882123947143555, + "num_tokens": 567625135.0, + "step": 21941 + }, + { + "epoch": 2.4096200307489566, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.551786184310913, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7303988933563232, + "num_tokens": 567647853.0, + "step": 21942 + }, + { + "epoch": 2.4097298484515703, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2646005153656006, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.7030884027481079, + "num_tokens": 567676905.0, + "step": 21943 + }, + { + "epoch": 2.409839666154184, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 32.603370666503906, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7352303862571716, + "num_tokens": 567698642.0, + "step": 21944 + }, + { + "epoch": 2.409949483856798, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3176980018615723, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7237681150436401, + "num_tokens": 567729030.0, + "step": 21945 + }, + { + "epoch": 2.4100593015594116, + "ewc_loss": 2.181529998779297e-05, + 
"grad_norm": 2.8015635013580322, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.739365816116333, + "num_tokens": 567747945.0, + "step": 21946 + }, + { + "epoch": 2.410169119262025, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5161707401275635, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7137939929962158, + "num_tokens": 567773337.0, + "step": 21947 + }, + { + "epoch": 2.4102789369646387, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.558868408203125, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6998297572135925, + "num_tokens": 567798204.0, + "step": 21948 + }, + { + "epoch": 2.4103887546672524, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.56242036819458, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7069766521453857, + "num_tokens": 567824427.0, + "step": 21949 + }, + { + "epoch": 2.410498572369866, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2098095417022705, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.737091600894928, + "num_tokens": 567853385.0, + "step": 21950 + }, + { + "epoch": 2.4106083900724795, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8615827560424805, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7182689905166626, + "num_tokens": 567874455.0, + "step": 21951 + }, + { + "epoch": 2.4107182077750933, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2338216304779053, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7157282829284668, + "num_tokens": 567904011.0, + "step": 21952 + }, + { + "epoch": 2.410828025477707, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.396390438079834, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6975934505462646, + "num_tokens": 567929546.0, + "step": 21953 + }, + { + "epoch": 2.410937843180321, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.357802152633667, + 
"learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7505339980125427, + "num_tokens": 567955058.0, + "step": 21954 + }, + { + "epoch": 2.411047660882934, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.373321533203125, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7304195165634155, + "num_tokens": 567982081.0, + "step": 21955 + }, + { + "epoch": 2.411157478585548, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4636011123657227, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7167456746101379, + "num_tokens": 568007640.0, + "step": 21956 + }, + { + "epoch": 2.4112672962881616, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.557248592376709, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.7096461057662964, + "num_tokens": 568031630.0, + "step": 21957 + }, + { + "epoch": 2.4113771139907754, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5271637439727783, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7167772054672241, + "num_tokens": 568054579.0, + "step": 21958 + }, + { + "epoch": 2.411486931693389, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6214070320129395, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7314954400062561, + "num_tokens": 568078160.0, + "step": 21959 + }, + { + "epoch": 2.4115967493960024, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.453674554824829, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.705470085144043, + "num_tokens": 568102673.0, + "step": 21960 + }, + { + "epoch": 2.411706567098616, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3433189392089844, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7173480987548828, + "num_tokens": 568129707.0, + "step": 21961 + }, + { + "epoch": 2.41181638480123, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2946808338165283, + "learning_rate": 1e-06, + "loss": 0.9481, 
+ "mean_token_accuracy": 0.7184141874313354, + "num_tokens": 568159400.0, + "step": 21962 + }, + { + "epoch": 2.4119262025038437, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.366631507873535, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.7085979580879211, + "num_tokens": 568184122.0, + "step": 21963 + }, + { + "epoch": 2.4120360202064575, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6111295223236084, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7024478912353516, + "num_tokens": 568207931.0, + "step": 21964 + }, + { + "epoch": 2.412145837909071, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2176737785339355, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7160393595695496, + "num_tokens": 568239815.0, + "step": 21965 + }, + { + "epoch": 2.4122556556116845, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1050992012023926, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6985558271408081, + "num_tokens": 568277237.0, + "step": 21966 + }, + { + "epoch": 2.4123654733142983, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.137608528137207, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.7033875584602356, + "num_tokens": 568311920.0, + "step": 21967 + }, + { + "epoch": 2.412475291016912, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7770731449127197, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7569049596786499, + "num_tokens": 568330760.0, + "step": 21968 + }, + { + "epoch": 2.412585108719526, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5401298999786377, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6986542344093323, + "num_tokens": 568355507.0, + "step": 21969 + }, + { + "epoch": 2.412694926422139, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.536177635192871, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 
0.7048258185386658, + "num_tokens": 568382024.0, + "step": 21970 + }, + { + "epoch": 2.412804744124753, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.620591163635254, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7121345400810242, + "num_tokens": 568405147.0, + "step": 21971 + }, + { + "epoch": 2.4129145618273666, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.275867223739624, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7083818912506104, + "num_tokens": 568433274.0, + "step": 21972 + }, + { + "epoch": 2.4130243795299804, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.605417251586914, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.716217041015625, + "num_tokens": 568457178.0, + "step": 21973 + }, + { + "epoch": 2.4131341972325937, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1974546909332275, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.731861412525177, + "num_tokens": 568487821.0, + "step": 21974 + }, + { + "epoch": 2.4132440149352075, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5453505516052246, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7088006734848022, + "num_tokens": 568511891.0, + "step": 21975 + }, + { + "epoch": 2.413353832637821, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7608399391174316, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7106040120124817, + "num_tokens": 568533065.0, + "step": 21976 + }, + { + "epoch": 2.413463650340435, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2602579593658447, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6927288174629211, + "num_tokens": 568562977.0, + "step": 21977 + }, + { + "epoch": 2.4135734680430487, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.394829273223877, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7042574882507324, + "num_tokens": 
568588649.0, + "step": 21978 + }, + { + "epoch": 2.413683285745662, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.831716537475586, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.716522753238678, + "num_tokens": 568609408.0, + "step": 21979 + }, + { + "epoch": 2.413793103448276, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.398714065551758, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6977117657661438, + "num_tokens": 568635159.0, + "step": 21980 + }, + { + "epoch": 2.4139029211508896, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.386845588684082, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6946777105331421, + "num_tokens": 568662805.0, + "step": 21981 + }, + { + "epoch": 2.4140127388535033, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6358039379119873, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7234760522842407, + "num_tokens": 568686176.0, + "step": 21982 + }, + { + "epoch": 2.4141225565561166, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8288307189941406, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7037684917449951, + "num_tokens": 568706853.0, + "step": 21983 + }, + { + "epoch": 2.4142323742587304, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.236814260482788, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7031770944595337, + "num_tokens": 568736656.0, + "step": 21984 + }, + { + "epoch": 2.414342191961344, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6071617603302, + "learning_rate": 1e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7454174160957336, + "num_tokens": 568758166.0, + "step": 21985 + }, + { + "epoch": 2.414452009663958, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6184892654418945, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7234932780265808, + "num_tokens": 568778611.0, + "step": 21986 + }, + { + 
"epoch": 2.4145618273665717, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.790679454803467, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7202487587928772, + "num_tokens": 568799349.0, + "step": 21987 + }, + { + "epoch": 2.414671645069185, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.240168809890747, + "learning_rate": 1e-06, + "loss": 1.0874, + "mean_token_accuracy": 0.6983956098556519, + "num_tokens": 568830941.0, + "step": 21988 + }, + { + "epoch": 2.4147814627717987, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1731700897216797, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6988170742988586, + "num_tokens": 568861571.0, + "step": 21989 + }, + { + "epoch": 2.4148912804744125, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.338275909423828, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7202879190444946, + "num_tokens": 568889511.0, + "step": 21990 + }, + { + "epoch": 2.4150010981770262, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.342440605163574, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7273657917976379, + "num_tokens": 568916393.0, + "step": 21991 + }, + { + "epoch": 2.41511091587964, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.317815065383911, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7064319849014282, + "num_tokens": 568943616.0, + "step": 21992 + }, + { + "epoch": 2.4152207335822533, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.659044027328491, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7240680456161499, + "num_tokens": 568966047.0, + "step": 21993 + }, + { + "epoch": 2.415330551284867, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7172462940216064, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7408398985862732, + "num_tokens": 568987775.0, + "step": 21994 + }, + { + "epoch": 2.415440368987481, + "ewc_loss": 
2.181529998779297e-05, + "grad_norm": 2.299874782562256, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7140209674835205, + "num_tokens": 569017181.0, + "step": 21995 + }, + { + "epoch": 2.4155501866900946, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.340886354446411, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7315018177032471, + "num_tokens": 569043843.0, + "step": 21996 + }, + { + "epoch": 2.4156600043927083, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4795401096343994, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.71140456199646, + "num_tokens": 569070037.0, + "step": 21997 + }, + { + "epoch": 2.4157698220953217, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8346292972564697, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7240689992904663, + "num_tokens": 569090111.0, + "step": 21998 + }, + { + "epoch": 2.4158796397979354, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 3.113727569580078, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6970418691635132, + "num_tokens": 569108485.0, + "step": 21999 + }, + { + "epoch": 2.415989457500549, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4731531143188477, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6984454393386841, + "num_tokens": 569134939.0, + "step": 22000 + }, + { + "epoch": 2.416099275203163, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2939164638519287, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7222450971603394, + "num_tokens": 569163020.0, + "step": 22001 + }, + { + "epoch": 2.4162090929057762, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4771666526794434, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7136051654815674, + "num_tokens": 569188176.0, + "step": 22002 + }, + { + "epoch": 2.41631891060839, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 
2.2534596920013428, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.727792501449585, + "num_tokens": 569216999.0, + "step": 22003 + }, + { + "epoch": 2.4164287283110037, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.166607618331909, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7183537483215332, + "num_tokens": 569247007.0, + "step": 22004 + }, + { + "epoch": 2.4165385460136175, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8531813621520996, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7296434640884399, + "num_tokens": 569264811.0, + "step": 22005 + }, + { + "epoch": 2.416648363716231, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.651157855987549, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.7074120044708252, + "num_tokens": 569289177.0, + "step": 22006 + }, + { + "epoch": 2.4167581814188446, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2064194679260254, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7015944719314575, + "num_tokens": 569320467.0, + "step": 22007 + }, + { + "epoch": 2.4168679991214583, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5920469760894775, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.713447630405426, + "num_tokens": 569344687.0, + "step": 22008 + }, + { + "epoch": 2.416977816824072, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1843576431274414, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7086182832717896, + "num_tokens": 569376539.0, + "step": 22009 + }, + { + "epoch": 2.417087634526686, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.464679718017578, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7358331680297852, + "num_tokens": 569399611.0, + "step": 22010 + }, + { + "epoch": 2.417197452229299, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4686336517333984, + "learning_rate": 
1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.692818820476532, + "num_tokens": 569425410.0, + "step": 22011 + }, + { + "epoch": 2.417307269931913, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 3.199227809906006, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7224461436271667, + "num_tokens": 569447821.0, + "step": 22012 + }, + { + "epoch": 2.4174170876345267, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6182944774627686, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7065232396125793, + "num_tokens": 569471760.0, + "step": 22013 + }, + { + "epoch": 2.4175269053371404, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.611386299133301, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.72037672996521, + "num_tokens": 569495548.0, + "step": 22014 + }, + { + "epoch": 2.417636723039754, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6075024604797363, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7114080190658569, + "num_tokens": 569518643.0, + "step": 22015 + }, + { + "epoch": 2.4177465407423675, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8963394165039062, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7208056449890137, + "num_tokens": 569538571.0, + "step": 22016 + }, + { + "epoch": 2.4178563584449813, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.251948356628418, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.7046014070510864, + "num_tokens": 569569174.0, + "step": 22017 + }, + { + "epoch": 2.417966176147595, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4487407207489014, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6997765302658081, + "num_tokens": 569595496.0, + "step": 22018 + }, + { + "epoch": 2.4180759938502088, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1517319679260254, + "learning_rate": 1e-06, + "loss": 1.0263, + 
"mean_token_accuracy": 0.694953203201294, + "num_tokens": 569629327.0, + "step": 22019 + }, + { + "epoch": 2.4181858115528225, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3357107639312744, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7382423281669617, + "num_tokens": 569654413.0, + "step": 22020 + }, + { + "epoch": 2.418295629255436, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.809069871902466, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7344534397125244, + "num_tokens": 569676924.0, + "step": 22021 + }, + { + "epoch": 2.4184054469580496, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2644197940826416, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7147307991981506, + "num_tokens": 569706225.0, + "step": 22022 + }, + { + "epoch": 2.4185152646606634, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.522833824157715, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7196676135063171, + "num_tokens": 569729987.0, + "step": 22023 + }, + { + "epoch": 2.418625082363277, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6485836505889893, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.7016206979751587, + "num_tokens": 569751708.0, + "step": 22024 + }, + { + "epoch": 2.418734900065891, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5478503704071045, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7157626748085022, + "num_tokens": 569775611.0, + "step": 22025 + }, + { + "epoch": 2.418844717768504, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.11810564994812, + "learning_rate": 1e-06, + "loss": 1.1542, + "mean_token_accuracy": 0.6815299987792969, + "num_tokens": 569811089.0, + "step": 22026 + }, + { + "epoch": 2.418954535471118, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3770556449890137, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7131885290145874, 
+ "num_tokens": 569836572.0, + "step": 22027 + }, + { + "epoch": 2.4190643531737317, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8125510215759277, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.725631833076477, + "num_tokens": 569856953.0, + "step": 22028 + }, + { + "epoch": 2.4191741708763455, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.417724609375, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7324704527854919, + "num_tokens": 569883376.0, + "step": 22029 + }, + { + "epoch": 2.4192839885789588, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.508133888244629, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7115904092788696, + "num_tokens": 569907999.0, + "step": 22030 + }, + { + "epoch": 2.4193938062815725, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2824039459228516, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7111425995826721, + "num_tokens": 569938111.0, + "step": 22031 + }, + { + "epoch": 2.4195036239841863, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3238472938537598, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.700767993927002, + "num_tokens": 569966184.0, + "step": 22032 + }, + { + "epoch": 2.4196134416868, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8774197101593018, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.749707043170929, + "num_tokens": 569985005.0, + "step": 22033 + }, + { + "epoch": 2.4197232593894134, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.48816180229187, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7073070406913757, + "num_tokens": 570010300.0, + "step": 22034 + }, + { + "epoch": 2.419833077092027, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5693986415863037, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7251067757606506, + "num_tokens": 570032860.0, + "step": 22035 
+ }, + { + "epoch": 2.419942894794641, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3527393341064453, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7318867444992065, + "num_tokens": 570061059.0, + "step": 22036 + }, + { + "epoch": 2.4200527124972546, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4458727836608887, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7260385751724243, + "num_tokens": 570087969.0, + "step": 22037 + }, + { + "epoch": 2.4201625301998684, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.442812919616699, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6943768262863159, + "num_tokens": 570115084.0, + "step": 22038 + }, + { + "epoch": 2.4202723479024817, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2333741188049316, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7128181457519531, + "num_tokens": 570143869.0, + "step": 22039 + }, + { + "epoch": 2.4203821656050954, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.245076894760132, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7112366557121277, + "num_tokens": 570173988.0, + "step": 22040 + }, + { + "epoch": 2.420491983307709, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3993566036224365, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.698053240776062, + "num_tokens": 570202500.0, + "step": 22041 + }, + { + "epoch": 2.420601801010323, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.644690990447998, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7163587808609009, + "num_tokens": 570226143.0, + "step": 22042 + }, + { + "epoch": 2.4207116187129367, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.693511962890625, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7283507585525513, + "num_tokens": 570246966.0, + "step": 22043 + }, + { + "epoch": 2.42082143641555, + 
"ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2278523445129395, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7331023812294006, + "num_tokens": 570276122.0, + "step": 22044 + }, + { + "epoch": 2.420931254118164, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.54154372215271, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7086681127548218, + "num_tokens": 570299470.0, + "step": 22045 + }, + { + "epoch": 2.4210410718207775, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4431350231170654, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7155240774154663, + "num_tokens": 570326235.0, + "step": 22046 + }, + { + "epoch": 2.4211508895233913, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6688766479492188, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7270848155021667, + "num_tokens": 570346722.0, + "step": 22047 + }, + { + "epoch": 2.421260707226005, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8044240474700928, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.7012240290641785, + "num_tokens": 570368090.0, + "step": 22048 + }, + { + "epoch": 2.4213705249286184, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.636415958404541, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7177414894104004, + "num_tokens": 570391613.0, + "step": 22049 + }, + { + "epoch": 2.421480342631232, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.770463705062866, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7321524620056152, + "num_tokens": 570411032.0, + "step": 22050 + }, + { + "epoch": 2.421590160333846, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.958174228668213, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7063130140304565, + "num_tokens": 570435697.0, + "step": 22051 + }, + { + "epoch": 2.4216999780364596, + "ewc_loss": 2.181529998779297e-05, + 
"grad_norm": 2.7507383823394775, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7065757513046265, + "num_tokens": 570457632.0, + "step": 22052 + }, + { + "epoch": 2.421809795739073, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5014376640319824, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7370002269744873, + "num_tokens": 570482577.0, + "step": 22053 + }, + { + "epoch": 2.4219196134416867, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2507271766662598, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7048797607421875, + "num_tokens": 570515113.0, + "step": 22054 + }, + { + "epoch": 2.4220294311443005, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2120306491851807, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7058398723602295, + "num_tokens": 570546724.0, + "step": 22055 + }, + { + "epoch": 2.4221392488469142, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.3956120014190674, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7190699577331543, + "num_tokens": 570572622.0, + "step": 22056 + }, + { + "epoch": 2.4222490665495275, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.702504873275757, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7407057285308838, + "num_tokens": 570592893.0, + "step": 22057 + }, + { + "epoch": 2.4223588842521413, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5177109241485596, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7085812091827393, + "num_tokens": 570617906.0, + "step": 22058 + }, + { + "epoch": 2.422468701954755, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6881635189056396, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7145260572433472, + "num_tokens": 570640269.0, + "step": 22059 + }, + { + "epoch": 2.422578519657369, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.890446901321411, + 
"learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7279466986656189, + "num_tokens": 570661526.0, + "step": 22060 + }, + { + "epoch": 2.4226883373599826, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.275089740753174, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7016103267669678, + "num_tokens": 570692755.0, + "step": 22061 + }, + { + "epoch": 2.422798155062596, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.738973379135132, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7207953929901123, + "num_tokens": 570715163.0, + "step": 22062 + }, + { + "epoch": 2.4229079727652096, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1987290382385254, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6978856921195984, + "num_tokens": 570746436.0, + "step": 22063 + }, + { + "epoch": 2.4230177904678234, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.841085433959961, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7045414447784424, + "num_tokens": 570768185.0, + "step": 22064 + }, + { + "epoch": 2.423127608170437, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6267476081848145, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7157655358314514, + "num_tokens": 570793782.0, + "step": 22065 + }, + { + "epoch": 2.423237425873051, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4022481441497803, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7343379259109497, + "num_tokens": 570817523.0, + "step": 22066 + }, + { + "epoch": 2.4233472435756642, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.307796001434326, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7101742029190063, + "num_tokens": 570847349.0, + "step": 22067 + }, + { + "epoch": 2.423457061278278, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3579680919647217, + "learning_rate": 1e-06, + "loss": 
0.9045, + "mean_token_accuracy": 0.7290847301483154, + "num_tokens": 570874899.0, + "step": 22068 + }, + { + "epoch": 2.4235668789808917, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4818191528320312, + "learning_rate": 1e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7593432664871216, + "num_tokens": 570898312.0, + "step": 22069 + }, + { + "epoch": 2.4236766966835055, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.475963592529297, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7120824456214905, + "num_tokens": 570923526.0, + "step": 22070 + }, + { + "epoch": 2.4237865143861193, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1581032276153564, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7167003154754639, + "num_tokens": 570955857.0, + "step": 22071 + }, + { + "epoch": 2.4238963320887326, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.320117473602295, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7305359840393066, + "num_tokens": 570982408.0, + "step": 22072 + }, + { + "epoch": 2.4240061497913463, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.337998390197754, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7123238444328308, + "num_tokens": 571012542.0, + "step": 22073 + }, + { + "epoch": 2.42411596749396, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3543882369995117, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7002314925193787, + "num_tokens": 571038782.0, + "step": 22074 + }, + { + "epoch": 2.424225785196574, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.145843029022217, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.730940043926239, + "num_tokens": 571071042.0, + "step": 22075 + }, + { + "epoch": 2.4243356028991876, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.9732344150543213, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 
0.7152226567268372, + "num_tokens": 571097481.0, + "step": 22076 + }, + { + "epoch": 2.424445420601801, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.634206533432007, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7171958088874817, + "num_tokens": 571123289.0, + "step": 22077 + }, + { + "epoch": 2.4245552383044147, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4319188594818115, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.718168318271637, + "num_tokens": 571150458.0, + "step": 22078 + }, + { + "epoch": 2.4246650560070284, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5825390815734863, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.733537495136261, + "num_tokens": 571174379.0, + "step": 22079 + }, + { + "epoch": 2.424774873709642, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.313523292541504, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7167657613754272, + "num_tokens": 571203623.0, + "step": 22080 + }, + { + "epoch": 2.4248846914122555, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.182523250579834, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6925609111785889, + "num_tokens": 571236688.0, + "step": 22081 + }, + { + "epoch": 2.4249945091148692, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2456536293029785, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.7074590921401978, + "num_tokens": 571266124.0, + "step": 22082 + }, + { + "epoch": 2.425104326817483, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6471357345581055, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7041290402412415, + "num_tokens": 571289727.0, + "step": 22083 + }, + { + "epoch": 2.4252141445200968, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3078017234802246, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7195829749107361, + "num_tokens": 
571317444.0, + "step": 22084 + }, + { + "epoch": 2.42532396222271, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.407116413116455, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7061941623687744, + "num_tokens": 571346632.0, + "step": 22085 + }, + { + "epoch": 2.425433779925324, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.397141218185425, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7086389064788818, + "num_tokens": 571374381.0, + "step": 22086 + }, + { + "epoch": 2.4255435976279376, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.427924633026123, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.704397439956665, + "num_tokens": 571400742.0, + "step": 22087 + }, + { + "epoch": 2.4256534153305513, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.372859477996826, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.732032299041748, + "num_tokens": 571427455.0, + "step": 22088 + }, + { + "epoch": 2.425763233033165, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5500893592834473, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7012979984283447, + "num_tokens": 571452419.0, + "step": 22089 + }, + { + "epoch": 2.4258730507357784, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.439821481704712, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7129336595535278, + "num_tokens": 571479041.0, + "step": 22090 + }, + { + "epoch": 2.425982868438392, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4911651611328125, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7273126244544983, + "num_tokens": 571501627.0, + "step": 22091 + }, + { + "epoch": 2.426092686141006, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4316985607147217, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7128671407699585, + "num_tokens": 571527922.0, + "step": 22092 + }, + { + 
"epoch": 2.4262025038436197, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.549382209777832, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7386458516120911, + "num_tokens": 571552399.0, + "step": 22093 + }, + { + "epoch": 2.4263123215462334, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.461294174194336, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.717857837677002, + "num_tokens": 571577848.0, + "step": 22094 + }, + { + "epoch": 2.4264221392488468, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.278608560562134, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.7032090425491333, + "num_tokens": 571607660.0, + "step": 22095 + }, + { + "epoch": 2.4265319569514605, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5343754291534424, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7196443676948547, + "num_tokens": 571633207.0, + "step": 22096 + }, + { + "epoch": 2.4266417746540743, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.389176607131958, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.7034933567047119, + "num_tokens": 571660053.0, + "step": 22097 + }, + { + "epoch": 2.426751592356688, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.330683708190918, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7162134647369385, + "num_tokens": 571689648.0, + "step": 22098 + }, + { + "epoch": 2.426861410059302, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4711644649505615, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6916594505310059, + "num_tokens": 571718070.0, + "step": 22099 + }, + { + "epoch": 2.426971227761915, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.08647084236145, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7146532535552979, + "num_tokens": 571755031.0, + "step": 22100 + }, + { + "epoch": 2.427081045464529, + "ewc_loss": 
2.181529998779297e-05, + "grad_norm": 2.4502673149108887, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.715458869934082, + "num_tokens": 571779782.0, + "step": 22101 + }, + { + "epoch": 2.4271908631671426, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.673485279083252, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7336167097091675, + "num_tokens": 571800827.0, + "step": 22102 + }, + { + "epoch": 2.4273006808697564, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.54236102104187, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7078967094421387, + "num_tokens": 571823948.0, + "step": 22103 + }, + { + "epoch": 2.4274104985723697, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.865389347076416, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7271824479103088, + "num_tokens": 571844500.0, + "step": 22104 + }, + { + "epoch": 2.4275203162749834, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.431020736694336, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7066789269447327, + "num_tokens": 571871878.0, + "step": 22105 + }, + { + "epoch": 2.427630133977597, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4181430339813232, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7018513679504395, + "num_tokens": 571899440.0, + "step": 22106 + }, + { + "epoch": 2.427739951680211, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.32542085647583, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7161912322044373, + "num_tokens": 571930948.0, + "step": 22107 + }, + { + "epoch": 2.4278497693828243, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.381490468978882, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.7073464393615723, + "num_tokens": 571960027.0, + "step": 22108 + }, + { + "epoch": 2.427959587085438, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 
2.3759589195251465, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6826076507568359, + "num_tokens": 571989873.0, + "step": 22109 + }, + { + "epoch": 2.4280694047880518, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5552093982696533, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7102798819541931, + "num_tokens": 572017896.0, + "step": 22110 + }, + { + "epoch": 2.4281792224906655, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.573700428009033, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7108152508735657, + "num_tokens": 572042412.0, + "step": 22111 + }, + { + "epoch": 2.4282890401932793, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4740583896636963, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7304087281227112, + "num_tokens": 572068675.0, + "step": 22112 + }, + { + "epoch": 2.4283988578958926, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.5193798542022705, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7150496244430542, + "num_tokens": 572092858.0, + "step": 22113 + }, + { + "epoch": 2.4285086755985064, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2780113220214844, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7212165594100952, + "num_tokens": 572122554.0, + "step": 22114 + }, + { + "epoch": 2.42861849330112, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5035178661346436, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7083498239517212, + "num_tokens": 572145919.0, + "step": 22115 + }, + { + "epoch": 2.428728311003734, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2304983139038086, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7059189081192017, + "num_tokens": 572174593.0, + "step": 22116 + }, + { + "epoch": 2.4288381287063476, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.9346976280212402, + 
"learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7212256193161011, + "num_tokens": 572194188.0, + "step": 22117 + }, + { + "epoch": 2.428947946408961, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7665374279022217, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7206280827522278, + "num_tokens": 572213760.0, + "step": 22118 + }, + { + "epoch": 2.4290577641115747, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3873090744018555, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7048749327659607, + "num_tokens": 572240846.0, + "step": 22119 + }, + { + "epoch": 2.4291675818141885, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8932230472564697, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.718522310256958, + "num_tokens": 572260410.0, + "step": 22120 + }, + { + "epoch": 2.429277399516802, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4549615383148193, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6983678936958313, + "num_tokens": 572288503.0, + "step": 22121 + }, + { + "epoch": 2.429387217219416, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4510679244995117, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7089285254478455, + "num_tokens": 572314762.0, + "step": 22122 + }, + { + "epoch": 2.4294970349220293, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.491227388381958, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7111477255821228, + "num_tokens": 572338449.0, + "step": 22123 + }, + { + "epoch": 2.429606852624643, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2254364490509033, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.709044337272644, + "num_tokens": 572371814.0, + "step": 22124 + }, + { + "epoch": 2.429716670327257, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5026371479034424, + "learning_rate": 1e-06, + "loss": 
1.0116, + "mean_token_accuracy": 0.700954794883728, + "num_tokens": 572401330.0, + "step": 22125 + }, + { + "epoch": 2.4298264880298706, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 4.291849136352539, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7267574667930603, + "num_tokens": 572431993.0, + "step": 22126 + }, + { + "epoch": 2.4299363057324843, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2356340885162354, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7359195947647095, + "num_tokens": 572461968.0, + "step": 22127 + }, + { + "epoch": 2.4300461234350976, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4905874729156494, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6950714588165283, + "num_tokens": 572487597.0, + "step": 22128 + }, + { + "epoch": 2.4301559411377114, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3012807369232178, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7225064039230347, + "num_tokens": 572513688.0, + "step": 22129 + }, + { + "epoch": 2.430265758840325, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4567553997039795, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.710184633731842, + "num_tokens": 572537892.0, + "step": 22130 + }, + { + "epoch": 2.430375576542939, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.481764554977417, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7248625755310059, + "num_tokens": 572562504.0, + "step": 22131 + }, + { + "epoch": 2.430485394245552, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.734869956970215, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.750065267086029, + "num_tokens": 572585240.0, + "step": 22132 + }, + { + "epoch": 2.430595211948166, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.67364501953125, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 
0.7092612385749817, + "num_tokens": 572608644.0, + "step": 22133 + }, + { + "epoch": 2.4307050296507797, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.451390027999878, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.730076789855957, + "num_tokens": 572634180.0, + "step": 22134 + }, + { + "epoch": 2.4308148473533935, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6981539726257324, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7147037386894226, + "num_tokens": 572655918.0, + "step": 22135 + }, + { + "epoch": 2.430924665056007, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.292966604232788, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7243760228157043, + "num_tokens": 572683866.0, + "step": 22136 + }, + { + "epoch": 2.4310344827586206, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6092827320098877, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7331527471542358, + "num_tokens": 572706081.0, + "step": 22137 + }, + { + "epoch": 2.4311443004612343, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5917460918426514, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7314698696136475, + "num_tokens": 572729135.0, + "step": 22138 + }, + { + "epoch": 2.431254118163848, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.634162187576294, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7425550818443298, + "num_tokens": 572752702.0, + "step": 22139 + }, + { + "epoch": 2.431363935866462, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.476646900177002, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7233165502548218, + "num_tokens": 572778951.0, + "step": 22140 + }, + { + "epoch": 2.431473753569075, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.84616756439209, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.755767822265625, + "num_tokens": 
572797376.0, + "step": 22141 + }, + { + "epoch": 2.431583571271689, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.539309024810791, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.709384560585022, + "num_tokens": 572821225.0, + "step": 22142 + }, + { + "epoch": 2.4316933889743026, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.310913562774658, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7296247482299805, + "num_tokens": 572851640.0, + "step": 22143 + }, + { + "epoch": 2.4318032066769164, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4270458221435547, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7043401002883911, + "num_tokens": 572876784.0, + "step": 22144 + }, + { + "epoch": 2.43191302437953, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6683640480041504, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.72053062915802, + "num_tokens": 572901633.0, + "step": 22145 + }, + { + "epoch": 2.4320228420821435, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5393106937408447, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7065869569778442, + "num_tokens": 572925554.0, + "step": 22146 + }, + { + "epoch": 2.4321326597847572, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8761773109436035, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7381635308265686, + "num_tokens": 572945689.0, + "step": 22147 + }, + { + "epoch": 2.432242477487371, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.494175672531128, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7189902067184448, + "num_tokens": 572969929.0, + "step": 22148 + }, + { + "epoch": 2.4323522951899847, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4713449478149414, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7294800281524658, + "num_tokens": 572996094.0, + "step": 22149 + }, + { + 
"epoch": 2.4324621128925985, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 3.0021233558654785, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7276644706726074, + "num_tokens": 573013678.0, + "step": 22150 + }, + { + "epoch": 2.432571930595212, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.313990831375122, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7066360712051392, + "num_tokens": 573044076.0, + "step": 22151 + }, + { + "epoch": 2.4326817482978256, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.284806489944458, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.7007976770401001, + "num_tokens": 573075911.0, + "step": 22152 + }, + { + "epoch": 2.4327915660004393, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.316000461578369, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.703228235244751, + "num_tokens": 573104044.0, + "step": 22153 + }, + { + "epoch": 2.432901383703053, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.595015287399292, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7073049545288086, + "num_tokens": 573128751.0, + "step": 22154 + }, + { + "epoch": 2.4330112014056664, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.184793472290039, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7378925085067749, + "num_tokens": 573161004.0, + "step": 22155 + }, + { + "epoch": 2.43312101910828, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.601877450942993, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7289937734603882, + "num_tokens": 573183890.0, + "step": 22156 + }, + { + "epoch": 2.433230836810894, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2171027660369873, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6936922073364258, + "num_tokens": 573214899.0, + "step": 22157 + }, + { + "epoch": 2.4333406545135077, + "ewc_loss": 
2.181529998779297e-05, + "grad_norm": 2.2478299140930176, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7095389366149902, + "num_tokens": 573244349.0, + "step": 22158 + }, + { + "epoch": 2.4334504722161214, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.505807876586914, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7175678014755249, + "num_tokens": 573266804.0, + "step": 22159 + }, + { + "epoch": 2.4335602899187347, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.906947135925293, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7190395593643188, + "num_tokens": 573285840.0, + "step": 22160 + }, + { + "epoch": 2.4336701076213485, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3183915615081787, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7084667086601257, + "num_tokens": 573313675.0, + "step": 22161 + }, + { + "epoch": 2.4337799253239623, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.416482925415039, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.713621973991394, + "num_tokens": 573341634.0, + "step": 22162 + }, + { + "epoch": 2.433889743026576, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1715407371520996, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6974565982818604, + "num_tokens": 573372386.0, + "step": 22163 + }, + { + "epoch": 2.4339995607291893, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5964086055755615, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7337188124656677, + "num_tokens": 573393531.0, + "step": 22164 + }, + { + "epoch": 2.434109378431803, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4796502590179443, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7240101099014282, + "num_tokens": 573421354.0, + "step": 22165 + }, + { + "epoch": 2.434219196134417, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 
2.2625815868377686, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6966909170150757, + "num_tokens": 573450837.0, + "step": 22166 + }, + { + "epoch": 2.4343290138370306, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.620129108428955, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7250677347183228, + "num_tokens": 573472620.0, + "step": 22167 + }, + { + "epoch": 2.4344388315396444, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4879231452941895, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7001070380210876, + "num_tokens": 573499618.0, + "step": 22168 + }, + { + "epoch": 2.4345486492422577, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4259495735168457, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7254050970077515, + "num_tokens": 573525270.0, + "step": 22169 + }, + { + "epoch": 2.4346584669448714, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2596168518066406, + "learning_rate": 1e-06, + "loss": 1.088, + "mean_token_accuracy": 0.6837739944458008, + "num_tokens": 573558375.0, + "step": 22170 + }, + { + "epoch": 2.434768284647485, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.585411548614502, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7118347883224487, + "num_tokens": 573580819.0, + "step": 22171 + }, + { + "epoch": 2.434878102350099, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6526834964752197, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.704343318939209, + "num_tokens": 573603032.0, + "step": 22172 + }, + { + "epoch": 2.4349879200527127, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7151002883911133, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7353599071502686, + "num_tokens": 573624413.0, + "step": 22173 + }, + { + "epoch": 2.435097737755326, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.54154634475708, + "learning_rate": 
1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7170383334159851, + "num_tokens": 573647899.0, + "step": 22174 + }, + { + "epoch": 2.4352075554579398, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.659123659133911, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6912437677383423, + "num_tokens": 573672654.0, + "step": 22175 + }, + { + "epoch": 2.4353173731605535, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7328379154205322, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7479537725448608, + "num_tokens": 573692131.0, + "step": 22176 + }, + { + "epoch": 2.4354271908631673, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.93574857711792, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7093122005462646, + "num_tokens": 573712666.0, + "step": 22177 + }, + { + "epoch": 2.435537008565781, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8517534732818604, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7361124753952026, + "num_tokens": 573732616.0, + "step": 22178 + }, + { + "epoch": 2.4356468262683943, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5054409503936768, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7177273035049438, + "num_tokens": 573755985.0, + "step": 22179 + }, + { + "epoch": 2.435756643971008, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3789329528808594, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6960318088531494, + "num_tokens": 573781635.0, + "step": 22180 + }, + { + "epoch": 2.435866461673622, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5822882652282715, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7417478561401367, + "num_tokens": 573803827.0, + "step": 22181 + }, + { + "epoch": 2.4359762793762356, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4578285217285156, + "learning_rate": 1e-06, + "loss": 0.9241, + 
"mean_token_accuracy": 0.736840546131134, + "num_tokens": 573828707.0, + "step": 22182 + }, + { + "epoch": 2.436086097078849, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.15959095954895, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7362020015716553, + "num_tokens": 573858685.0, + "step": 22183 + }, + { + "epoch": 2.4361959147814627, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5617291927337646, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7148889303207397, + "num_tokens": 573885476.0, + "step": 22184 + }, + { + "epoch": 2.4363057324840764, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5293538570404053, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7158020734786987, + "num_tokens": 573910411.0, + "step": 22185 + }, + { + "epoch": 2.43641555018669, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.221743583679199, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.724982738494873, + "num_tokens": 573940680.0, + "step": 22186 + }, + { + "epoch": 2.4365253678893035, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.10974383354187, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7272857427597046, + "num_tokens": 573974782.0, + "step": 22187 + }, + { + "epoch": 2.4366351855919173, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.455695629119873, + "learning_rate": 1e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.6858882904052734, + "num_tokens": 574001647.0, + "step": 22188 + }, + { + "epoch": 2.436745003294531, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4795594215393066, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7086523175239563, + "num_tokens": 574027374.0, + "step": 22189 + }, + { + "epoch": 2.436854820997145, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8728699684143066, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7196453213691711, + 
"num_tokens": 574047848.0, + "step": 22190 + }, + { + "epoch": 2.4369646386997585, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 8.564713478088379, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7259604930877686, + "num_tokens": 574071840.0, + "step": 22191 + }, + { + "epoch": 2.437074456402372, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6222715377807617, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7060128450393677, + "num_tokens": 574096632.0, + "step": 22192 + }, + { + "epoch": 2.4371842741049856, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.592233896255493, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7128090858459473, + "num_tokens": 574121632.0, + "step": 22193 + }, + { + "epoch": 2.4372940918075994, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.709312677383423, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7250047922134399, + "num_tokens": 574143569.0, + "step": 22194 + }, + { + "epoch": 2.437403909510213, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.719207286834717, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7267295122146606, + "num_tokens": 574165403.0, + "step": 22195 + }, + { + "epoch": 2.437513727212827, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4537250995635986, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7072262763977051, + "num_tokens": 574192232.0, + "step": 22196 + }, + { + "epoch": 2.43762354491544, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.484663963317871, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7228224277496338, + "num_tokens": 574218583.0, + "step": 22197 + }, + { + "epoch": 2.437733362618054, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3437888622283936, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7158562541007996, + "num_tokens": 574247284.0, + "step": 22198 + 
}, + { + "epoch": 2.4378431803206677, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.447002649307251, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7131503820419312, + "num_tokens": 574272439.0, + "step": 22199 + }, + { + "epoch": 2.4379529980232815, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.336265802383423, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7217572331428528, + "num_tokens": 574300708.0, + "step": 22200 + }, + { + "epoch": 2.4380628157258952, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3561294078826904, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7083041071891785, + "num_tokens": 574329878.0, + "step": 22201 + }, + { + "epoch": 2.4381726334285085, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2403905391693115, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7138062119483948, + "num_tokens": 574362905.0, + "step": 22202 + }, + { + "epoch": 2.4382824511311223, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3302066326141357, + "learning_rate": 1e-06, + "loss": 1.0934, + "mean_token_accuracy": 0.6841676831245422, + "num_tokens": 574389659.0, + "step": 22203 + }, + { + "epoch": 2.438392268833736, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4181597232818604, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.707518458366394, + "num_tokens": 574415611.0, + "step": 22204 + }, + { + "epoch": 2.43850208653635, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2141854763031006, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.7007943391799927, + "num_tokens": 574444974.0, + "step": 22205 + }, + { + "epoch": 2.4386119042389636, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5513949394226074, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7287790775299072, + "num_tokens": 574467654.0, + "step": 22206 + }, + { + "epoch": 2.438721721941577, 
+ "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8442039489746094, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7346043586730957, + "num_tokens": 574487929.0, + "step": 22207 + }, + { + "epoch": 2.4388315396441906, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.644927501678467, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.719017744064331, + "num_tokens": 574510732.0, + "step": 22208 + }, + { + "epoch": 2.4389413573468044, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3988637924194336, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7101708650588989, + "num_tokens": 574536138.0, + "step": 22209 + }, + { + "epoch": 2.439051175049418, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3117892742156982, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7249982357025146, + "num_tokens": 574563499.0, + "step": 22210 + }, + { + "epoch": 2.4391609927520315, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.353895664215088, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7051154375076294, + "num_tokens": 574589829.0, + "step": 22211 + }, + { + "epoch": 2.439270810454645, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.702925443649292, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7207256555557251, + "num_tokens": 574611065.0, + "step": 22212 + }, + { + "epoch": 2.439380628157259, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4013211727142334, + "learning_rate": 1e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6954534649848938, + "num_tokens": 574637669.0, + "step": 22213 + }, + { + "epoch": 2.4394904458598727, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.9879748821258545, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7384421825408936, + "num_tokens": 574656869.0, + "step": 22214 + }, + { + "epoch": 2.439600263562486, + "ewc_loss": 2.181529998779297e-05, + 
"grad_norm": 2.499140977859497, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7144073247909546, + "num_tokens": 574684604.0, + "step": 22215 + }, + { + "epoch": 2.4397100812651, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6150691509246826, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6986453533172607, + "num_tokens": 574708139.0, + "step": 22216 + }, + { + "epoch": 2.4398198989677136, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.445282459259033, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7497419118881226, + "num_tokens": 574732374.0, + "step": 22217 + }, + { + "epoch": 2.4399297166703273, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.475294351577759, + "learning_rate": 1e-06, + "loss": 1.0875, + "mean_token_accuracy": 0.6909654140472412, + "num_tokens": 574760582.0, + "step": 22218 + }, + { + "epoch": 2.440039534372941, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4592764377593994, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7209633588790894, + "num_tokens": 574786011.0, + "step": 22219 + }, + { + "epoch": 2.4401493520755544, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.477574586868286, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7115030288696289, + "num_tokens": 574812478.0, + "step": 22220 + }, + { + "epoch": 2.440259169778168, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4033043384552, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7234850525856018, + "num_tokens": 574838868.0, + "step": 22221 + }, + { + "epoch": 2.440368987480782, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.31107759475708, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6973884105682373, + "num_tokens": 574867871.0, + "step": 22222 + }, + { + "epoch": 2.4404788051833957, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.688779354095459, + 
"learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7147899866104126, + "num_tokens": 574890716.0, + "step": 22223 + }, + { + "epoch": 2.4405886228860094, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.550940990447998, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7173870801925659, + "num_tokens": 574916637.0, + "step": 22224 + }, + { + "epoch": 2.4406984405886227, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.433990240097046, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7205807566642761, + "num_tokens": 574943381.0, + "step": 22225 + }, + { + "epoch": 2.4408082582912365, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.242658853530884, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7109140157699585, + "num_tokens": 574972351.0, + "step": 22226 + }, + { + "epoch": 2.4409180759938502, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.343230724334717, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7238281965255737, + "num_tokens": 575000134.0, + "step": 22227 + }, + { + "epoch": 2.441027893696464, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1349360942840576, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7146008014678955, + "num_tokens": 575035356.0, + "step": 22228 + }, + { + "epoch": 2.4411377113990778, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6460163593292236, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7157412767410278, + "num_tokens": 575057505.0, + "step": 22229 + }, + { + "epoch": 2.441247529101691, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4546940326690674, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7269958853721619, + "num_tokens": 575081014.0, + "step": 22230 + }, + { + "epoch": 2.441357346804305, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1544644832611084, + "learning_rate": 1e-06, + "loss": 
1.088, + "mean_token_accuracy": 0.68497633934021, + "num_tokens": 575115581.0, + "step": 22231 + }, + { + "epoch": 2.4414671645069186, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6223483085632324, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7058906555175781, + "num_tokens": 575139436.0, + "step": 22232 + }, + { + "epoch": 2.4415769822095323, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2072958946228027, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6971815228462219, + "num_tokens": 575171593.0, + "step": 22233 + }, + { + "epoch": 2.4416867999121457, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.463066339492798, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7277137041091919, + "num_tokens": 575197302.0, + "step": 22234 + }, + { + "epoch": 2.4417966176147594, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3212649822235107, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7125039100646973, + "num_tokens": 575225667.0, + "step": 22235 + }, + { + "epoch": 2.441906435317373, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3989505767822266, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7152228355407715, + "num_tokens": 575250392.0, + "step": 22236 + }, + { + "epoch": 2.442016253019987, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.76348876953125, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7141531705856323, + "num_tokens": 575270867.0, + "step": 22237 + }, + { + "epoch": 2.4421260707226002, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1622707843780518, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6983616352081299, + "num_tokens": 575303943.0, + "step": 22238 + }, + { + "epoch": 2.442235888425214, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.747687816619873, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 
0.7186855673789978, + "num_tokens": 575325586.0, + "step": 22239 + }, + { + "epoch": 2.4423457061278278, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8414628505706787, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7210705280303955, + "num_tokens": 575345010.0, + "step": 22240 + }, + { + "epoch": 2.4424555238304415, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.426429510116577, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6888784170150757, + "num_tokens": 575371827.0, + "step": 22241 + }, + { + "epoch": 2.4425653415330553, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.998420238494873, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.727025032043457, + "num_tokens": 575397851.0, + "step": 22242 + }, + { + "epoch": 2.4426751592356686, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6539950370788574, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7087826728820801, + "num_tokens": 575420803.0, + "step": 22243 + }, + { + "epoch": 2.4427849769382823, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.829606771469116, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7293614149093628, + "num_tokens": 575441226.0, + "step": 22244 + }, + { + "epoch": 2.442894794640896, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.392420530319214, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.711739718914032, + "num_tokens": 575467323.0, + "step": 22245 + }, + { + "epoch": 2.44300461234351, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4699618816375732, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7144607305526733, + "num_tokens": 575492186.0, + "step": 22246 + }, + { + "epoch": 2.4431144300461236, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.646812677383423, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7212924957275391, + "num_tokens": 
575512859.0, + "step": 22247 + }, + { + "epoch": 2.443224247748737, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6236352920532227, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7284947633743286, + "num_tokens": 575534950.0, + "step": 22248 + }, + { + "epoch": 2.4433340654513507, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4157090187072754, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7317295074462891, + "num_tokens": 575560010.0, + "step": 22249 + }, + { + "epoch": 2.4434438831539644, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4161252975463867, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6978673934936523, + "num_tokens": 575587780.0, + "step": 22250 + }, + { + "epoch": 2.443553700856578, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.425478219985962, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7045395374298096, + "num_tokens": 575613729.0, + "step": 22251 + }, + { + "epoch": 2.443663518559192, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.440199851989746, + "learning_rate": 1e-06, + "loss": 1.0974, + "mean_token_accuracy": 0.6764070987701416, + "num_tokens": 575641897.0, + "step": 22252 + }, + { + "epoch": 2.4437733362618053, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.506838321685791, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7212543487548828, + "num_tokens": 575663939.0, + "step": 22253 + }, + { + "epoch": 2.443883153964419, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4675445556640625, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7054492235183716, + "num_tokens": 575693398.0, + "step": 22254 + }, + { + "epoch": 2.4439929716670328, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6119425296783447, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7186281681060791, + "num_tokens": 575716003.0, + "step": 22255 + }, + { 
+ "epoch": 2.4441027893696465, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6255693435668945, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7183005809783936, + "num_tokens": 575737645.0, + "step": 22256 + }, + { + "epoch": 2.4442126070722603, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.532878875732422, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7185494899749756, + "num_tokens": 575764482.0, + "step": 22257 + }, + { + "epoch": 2.4443224247748736, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.752457618713379, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7050644159317017, + "num_tokens": 575785339.0, + "step": 22258 + }, + { + "epoch": 2.4444322424774874, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4276580810546875, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7420691251754761, + "num_tokens": 575810353.0, + "step": 22259 + }, + { + "epoch": 2.444542060180101, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4933700561523438, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7086778879165649, + "num_tokens": 575837686.0, + "step": 22260 + }, + { + "epoch": 2.444651877882715, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2629127502441406, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.6916711330413818, + "num_tokens": 575873131.0, + "step": 22261 + }, + { + "epoch": 2.444761695585328, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.3132293224334717, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6950687170028687, + "num_tokens": 575901282.0, + "step": 22262 + }, + { + "epoch": 2.444871513287942, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.729022979736328, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7169390320777893, + "num_tokens": 575923327.0, + "step": 22263 + }, + { + "epoch": 2.4449813309905557, + 
"ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2834033966064453, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.7018156051635742, + "num_tokens": 575955089.0, + "step": 22264 + }, + { + "epoch": 2.4450911486931695, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3802387714385986, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6933966875076294, + "num_tokens": 575982716.0, + "step": 22265 + }, + { + "epoch": 2.4452009663957828, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.322711706161499, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6909220814704895, + "num_tokens": 576012552.0, + "step": 22266 + }, + { + "epoch": 2.4453107840983965, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7971031665802, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7232874631881714, + "num_tokens": 576033064.0, + "step": 22267 + }, + { + "epoch": 2.4454206018010103, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2569055557250977, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6991667747497559, + "num_tokens": 576063360.0, + "step": 22268 + }, + { + "epoch": 2.445530419503624, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4478471279144287, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.733191728591919, + "num_tokens": 576088138.0, + "step": 22269 + }, + { + "epoch": 2.445640237206238, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.129781723022461, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6927667856216431, + "num_tokens": 576120804.0, + "step": 22270 + }, + { + "epoch": 2.445750054908851, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.295492172241211, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7173718810081482, + "num_tokens": 576146306.0, + "step": 22271 + }, + { + "epoch": 2.445859872611465, + "ewc_loss": 2.181529998779297e-05, + 
"grad_norm": 2.511852502822876, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7072230577468872, + "num_tokens": 576171014.0, + "step": 22272 + }, + { + "epoch": 2.4459696903140786, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.339909791946411, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7020269632339478, + "num_tokens": 576197866.0, + "step": 22273 + }, + { + "epoch": 2.4460795080166924, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4286415576934814, + "learning_rate": 1e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.6820050477981567, + "num_tokens": 576225642.0, + "step": 22274 + }, + { + "epoch": 2.446189325719306, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3346292972564697, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7188476324081421, + "num_tokens": 576253010.0, + "step": 22275 + }, + { + "epoch": 2.4462991434219195, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5218582153320312, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7073677182197571, + "num_tokens": 576279767.0, + "step": 22276 + }, + { + "epoch": 2.446408961124533, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3873507976531982, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6933834552764893, + "num_tokens": 576310196.0, + "step": 22277 + }, + { + "epoch": 2.446518778827147, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4854211807250977, + "learning_rate": 1e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.6910089254379272, + "num_tokens": 576336773.0, + "step": 22278 + }, + { + "epoch": 2.4466285965297607, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6823928356170654, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.714138925075531, + "num_tokens": 576358569.0, + "step": 22279 + }, + { + "epoch": 2.4467384142323745, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.351975440979004, + 
"learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7178068161010742, + "num_tokens": 576386669.0, + "step": 22280 + }, + { + "epoch": 2.446848231934988, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.53727126121521, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.723241925239563, + "num_tokens": 576409761.0, + "step": 22281 + }, + { + "epoch": 2.4469580496376016, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.644474983215332, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7269032001495361, + "num_tokens": 576432019.0, + "step": 22282 + }, + { + "epoch": 2.4470678673402153, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6766953468322754, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7160007953643799, + "num_tokens": 576459040.0, + "step": 22283 + }, + { + "epoch": 2.447177685042829, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.381908893585205, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7319251298904419, + "num_tokens": 576485630.0, + "step": 22284 + }, + { + "epoch": 2.4472875027454424, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4235923290252686, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7273948788642883, + "num_tokens": 576510185.0, + "step": 22285 + }, + { + "epoch": 2.447397320448056, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5982701778411865, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.715458333492279, + "num_tokens": 576533097.0, + "step": 22286 + }, + { + "epoch": 2.44750713815067, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8476955890655518, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7195873856544495, + "num_tokens": 576552424.0, + "step": 22287 + }, + { + "epoch": 2.4476169558532836, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.666961908340454, + "learning_rate": 1e-06, + "loss": 0.8986, + 
"mean_token_accuracy": 0.7313222289085388, + "num_tokens": 576572851.0, + "step": 22288 + }, + { + "epoch": 2.447726773555897, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.372774124145508, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7228531837463379, + "num_tokens": 576599655.0, + "step": 22289 + }, + { + "epoch": 2.4478365912585107, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2277936935424805, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7125256061553955, + "num_tokens": 576628544.0, + "step": 22290 + }, + { + "epoch": 2.4479464089611245, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.275299072265625, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.729412317276001, + "num_tokens": 576657072.0, + "step": 22291 + }, + { + "epoch": 2.4480562266637382, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3559658527374268, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7125697135925293, + "num_tokens": 576683784.0, + "step": 22292 + }, + { + "epoch": 2.448166044366352, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.637261152267456, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7282930612564087, + "num_tokens": 576706149.0, + "step": 22293 + }, + { + "epoch": 2.4482758620689653, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6071295738220215, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.709528923034668, + "num_tokens": 576727558.0, + "step": 22294 + }, + { + "epoch": 2.448385679771579, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.368931531906128, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7166398763656616, + "num_tokens": 576754030.0, + "step": 22295 + }, + { + "epoch": 2.448495497474193, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.327845811843872, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.716179370880127, + 
"num_tokens": 576782912.0, + "step": 22296 + }, + { + "epoch": 2.4486053151768066, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4478678703308105, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7169982194900513, + "num_tokens": 576807119.0, + "step": 22297 + }, + { + "epoch": 2.4487151328794203, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.655240535736084, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7106084227561951, + "num_tokens": 576830037.0, + "step": 22298 + }, + { + "epoch": 2.4488249505820336, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.616675615310669, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7503302097320557, + "num_tokens": 576851476.0, + "step": 22299 + }, + { + "epoch": 2.4489347682846474, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6686654090881348, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.7036337852478027, + "num_tokens": 576875256.0, + "step": 22300 + }, + { + "epoch": 2.449044585987261, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3089945316314697, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7126206755638123, + "num_tokens": 576905309.0, + "step": 22301 + }, + { + "epoch": 2.449154403689875, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5083818435668945, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7093501091003418, + "num_tokens": 576930590.0, + "step": 22302 + }, + { + "epoch": 2.4492642213924887, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6652278900146484, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7167178392410278, + "num_tokens": 576952543.0, + "step": 22303 + }, + { + "epoch": 2.449374039095102, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.465681552886963, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7063364386558533, + "num_tokens": 576978843.0, + "step": 
22304 + }, + { + "epoch": 2.4494838567977157, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.455137252807617, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7064292430877686, + "num_tokens": 577004525.0, + "step": 22305 + }, + { + "epoch": 2.4495936745003295, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5504138469696045, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.716854453086853, + "num_tokens": 577028989.0, + "step": 22306 + }, + { + "epoch": 2.4497034922029433, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7094311714172363, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7337200045585632, + "num_tokens": 577049797.0, + "step": 22307 + }, + { + "epoch": 2.449813309905557, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5938913822174072, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7208223342895508, + "num_tokens": 577072989.0, + "step": 22308 + }, + { + "epoch": 2.4499231276081703, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5231635570526123, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7138475775718689, + "num_tokens": 577097721.0, + "step": 22309 + }, + { + "epoch": 2.450032945310784, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3379836082458496, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6935751438140869, + "num_tokens": 577125235.0, + "step": 22310 + }, + { + "epoch": 2.450142763013398, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.597618579864502, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.730441689491272, + "num_tokens": 577146621.0, + "step": 22311 + }, + { + "epoch": 2.4502525807160116, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.880155324935913, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7339039444923401, + "num_tokens": 577165943.0, + "step": 22312 + }, + { + "epoch": 
2.450362398418625, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6741480827331543, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7041133046150208, + "num_tokens": 577187725.0, + "step": 22313 + }, + { + "epoch": 2.4504722161212387, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4319956302642822, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7248785495758057, + "num_tokens": 577214425.0, + "step": 22314 + }, + { + "epoch": 2.4505820338238524, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3856618404388428, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7202005386352539, + "num_tokens": 577241342.0, + "step": 22315 + }, + { + "epoch": 2.450691851526466, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5320825576782227, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7297776341438293, + "num_tokens": 577264693.0, + "step": 22316 + }, + { + "epoch": 2.4508016692290795, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.2532427310943604, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.72303706407547, + "num_tokens": 577293109.0, + "step": 22317 + }, + { + "epoch": 2.4509114869316933, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.391083240509033, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7179206609725952, + "num_tokens": 577322748.0, + "step": 22318 + }, + { + "epoch": 2.451021304634307, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.472323179244995, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7136050462722778, + "num_tokens": 577346093.0, + "step": 22319 + }, + { + "epoch": 2.4511311223369208, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2067205905914307, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7181000709533691, + "num_tokens": 577376360.0, + "step": 22320 + }, + { + "epoch": 2.4512409400395345, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.6803722381591797, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7104718685150146, + "num_tokens": 577406450.0, + "step": 22321 + }, + { + "epoch": 2.451350757742148, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6781089305877686, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6993870735168457, + "num_tokens": 577428068.0, + "step": 22322 + }, + { + "epoch": 2.4514605754447616, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5415523052215576, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7239933013916016, + "num_tokens": 577451745.0, + "step": 22323 + }, + { + "epoch": 2.4515703931473753, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6366031169891357, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6962450742721558, + "num_tokens": 577473353.0, + "step": 22324 + }, + { + "epoch": 2.451680210849989, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 1.9993830919265747, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7090686559677124, + "num_tokens": 577511202.0, + "step": 22325 + }, + { + "epoch": 2.451790028552603, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2808260917663574, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7279700040817261, + "num_tokens": 577540374.0, + "step": 22326 + }, + { + "epoch": 2.451899846255216, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2088794708251953, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6910582780838013, + "num_tokens": 577571983.0, + "step": 22327 + }, + { + "epoch": 2.45200966395783, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.600480079650879, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7056366801261902, + "num_tokens": 577596579.0, + "step": 22328 + }, + { + "epoch": 2.4521194816604437, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 
2.2417547702789307, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7334588766098022, + "num_tokens": 577626159.0, + "step": 22329 + }, + { + "epoch": 2.4522292993630574, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2709107398986816, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7052492499351501, + "num_tokens": 577653838.0, + "step": 22330 + }, + { + "epoch": 2.452339117065671, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3410098552703857, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.709729790687561, + "num_tokens": 577681468.0, + "step": 22331 + }, + { + "epoch": 2.4524489347682845, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.332667112350464, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7094497680664062, + "num_tokens": 577711263.0, + "step": 22332 + }, + { + "epoch": 2.4525587524708983, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5503010749816895, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.718980073928833, + "num_tokens": 577734729.0, + "step": 22333 + }, + { + "epoch": 2.452668570173512, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.725276470184326, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7242067456245422, + "num_tokens": 577756749.0, + "step": 22334 + }, + { + "epoch": 2.452778387876126, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.252715587615967, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7115895748138428, + "num_tokens": 577785506.0, + "step": 22335 + }, + { + "epoch": 2.452888205578739, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.62900972366333, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7140787839889526, + "num_tokens": 577808889.0, + "step": 22336 + }, + { + "epoch": 2.452998023281353, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4532785415649414, + "learning_rate": 
1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.7010936737060547, + "num_tokens": 577837719.0, + "step": 22337 + }, + { + "epoch": 2.4531078409839666, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4313127994537354, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7190979719161987, + "num_tokens": 577863447.0, + "step": 22338 + }, + { + "epoch": 2.4532176586865804, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7105867862701416, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7111784219741821, + "num_tokens": 577883916.0, + "step": 22339 + }, + { + "epoch": 2.453327476389194, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.441743850708008, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6956446170806885, + "num_tokens": 577910642.0, + "step": 22340 + }, + { + "epoch": 2.4534372940918074, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3820273876190186, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7399699687957764, + "num_tokens": 577934895.0, + "step": 22341 + }, + { + "epoch": 2.453547111794421, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.5391783714294434, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7106223106384277, + "num_tokens": 577960405.0, + "step": 22342 + }, + { + "epoch": 2.453656929497035, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2199342250823975, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6811515092849731, + "num_tokens": 577991669.0, + "step": 22343 + }, + { + "epoch": 2.4537667471996487, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5353856086730957, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7367116212844849, + "num_tokens": 578012901.0, + "step": 22344 + }, + { + "epoch": 2.453876564902262, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.751540422439575, + "learning_rate": 1e-06, + "loss": 0.9126, + 
"mean_token_accuracy": 0.7326614260673523, + "num_tokens": 578033330.0, + "step": 22345 + }, + { + "epoch": 2.453986382604876, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.8379130363464355, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7303536534309387, + "num_tokens": 578053690.0, + "step": 22346 + }, + { + "epoch": 2.4540962003074895, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.343914270401001, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.714600682258606, + "num_tokens": 578084004.0, + "step": 22347 + }, + { + "epoch": 2.4542060180101033, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4326112270355225, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7243999242782593, + "num_tokens": 578109929.0, + "step": 22348 + }, + { + "epoch": 2.454315835712717, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2229087352752686, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7062069177627563, + "num_tokens": 578143144.0, + "step": 22349 + }, + { + "epoch": 2.4544256534153304, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2431657314300537, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7064539194107056, + "num_tokens": 578172629.0, + "step": 22350 + }, + { + "epoch": 2.454535471117944, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.424574375152588, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7199161648750305, + "num_tokens": 578196976.0, + "step": 22351 + }, + { + "epoch": 2.454645288820558, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.283057928085327, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.7019342184066772, + "num_tokens": 578226395.0, + "step": 22352 + }, + { + "epoch": 2.4547551065231716, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.2992050647735596, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 
0.7094254493713379, + "num_tokens": 578254367.0, + "step": 22353 + }, + { + "epoch": 2.4548649242257854, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.449890613555908, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7146669030189514, + "num_tokens": 578279572.0, + "step": 22354 + }, + { + "epoch": 2.4549747419283987, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5268380641937256, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7584768533706665, + "num_tokens": 578302458.0, + "step": 22355 + }, + { + "epoch": 2.4550845596310125, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.262877941131592, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6993948221206665, + "num_tokens": 578333129.0, + "step": 22356 + }, + { + "epoch": 2.455194377333626, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.438913583755493, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7301291823387146, + "num_tokens": 578357560.0, + "step": 22357 + }, + { + "epoch": 2.45530419503624, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.288666248321533, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7287180423736572, + "num_tokens": 578384080.0, + "step": 22358 + }, + { + "epoch": 2.4554140127388537, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3513236045837402, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6980069875717163, + "num_tokens": 578413855.0, + "step": 22359 + }, + { + "epoch": 2.455523830441467, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.408478260040283, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7150443196296692, + "num_tokens": 578441063.0, + "step": 22360 + }, + { + "epoch": 2.455633648144081, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3144304752349854, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6979520320892334, + "num_tokens": 
578469577.0, + "step": 22361 + }, + { + "epoch": 2.4557434658466946, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5706639289855957, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7313848733901978, + "num_tokens": 578492684.0, + "step": 22362 + }, + { + "epoch": 2.4558532835493083, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.518616199493408, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7164758443832397, + "num_tokens": 578516977.0, + "step": 22363 + }, + { + "epoch": 2.4559631012519216, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.54738450050354, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6990989446640015, + "num_tokens": 578542266.0, + "step": 22364 + }, + { + "epoch": 2.4560729189545354, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5516457557678223, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7206476330757141, + "num_tokens": 578564821.0, + "step": 22365 + }, + { + "epoch": 2.456182736657149, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.791898488998413, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7176332473754883, + "num_tokens": 578586215.0, + "step": 22366 + }, + { + "epoch": 2.456292554359763, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5075314044952393, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.708545446395874, + "num_tokens": 578612609.0, + "step": 22367 + }, + { + "epoch": 2.456402372062376, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4958443641662598, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.694005012512207, + "num_tokens": 578637678.0, + "step": 22368 + }, + { + "epoch": 2.45651218976499, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.169496536254883, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7007243037223816, + "num_tokens": 578669710.0, + "step": 22369 + }, + { + 
"epoch": 2.4566220074676037, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5155200958251953, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7325032353401184, + "num_tokens": 578693789.0, + "step": 22370 + }, + { + "epoch": 2.4567318251702175, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1986265182495117, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7110522985458374, + "num_tokens": 578724447.0, + "step": 22371 + }, + { + "epoch": 2.4568416428728312, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3669052124023438, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.7101606726646423, + "num_tokens": 578750989.0, + "step": 22372 + }, + { + "epoch": 2.4569514605754446, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.66313099861145, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7279892563819885, + "num_tokens": 578772778.0, + "step": 22373 + }, + { + "epoch": 2.4570612782780583, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.324281692504883, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.703553318977356, + "num_tokens": 578800895.0, + "step": 22374 + }, + { + "epoch": 2.457171095980672, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2908616065979004, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7387324571609497, + "num_tokens": 578829230.0, + "step": 22375 + }, + { + "epoch": 2.457280913683286, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3223013877868652, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7014518976211548, + "num_tokens": 578860437.0, + "step": 22376 + }, + { + "epoch": 2.4573907313858996, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3722617626190186, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6915697455406189, + "num_tokens": 578887887.0, + "step": 22377 + }, + { + "epoch": 2.457500549088513, + 
"ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3419747352600098, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.7016526460647583, + "num_tokens": 578916142.0, + "step": 22378 + }, + { + "epoch": 2.4576103667911267, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.339890480041504, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7172455191612244, + "num_tokens": 578943239.0, + "step": 22379 + }, + { + "epoch": 2.4577201844937404, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.334368944168091, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.726641058921814, + "num_tokens": 578970595.0, + "step": 22380 + }, + { + "epoch": 2.457830002196354, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3828771114349365, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7118740081787109, + "num_tokens": 578999841.0, + "step": 22381 + }, + { + "epoch": 2.457939819898968, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3399581909179688, + "learning_rate": 1e-06, + "loss": 1.0694, + "mean_token_accuracy": 0.6810444593429565, + "num_tokens": 579028000.0, + "step": 22382 + }, + { + "epoch": 2.4580496376015812, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.598013162612915, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7104973793029785, + "num_tokens": 579051268.0, + "step": 22383 + }, + { + "epoch": 2.458159455304195, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3231117725372314, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6922516822814941, + "num_tokens": 579079968.0, + "step": 22384 + }, + { + "epoch": 2.4582692730068088, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.711353302001953, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7175024747848511, + "num_tokens": 579100722.0, + "step": 22385 + }, + { + "epoch": 2.4583790907094225, + "ewc_loss": 2.181529998779297e-05, + 
"grad_norm": 2.3413994312286377, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7103197574615479, + "num_tokens": 579129843.0, + "step": 22386 + }, + { + "epoch": 2.4584889084120363, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1214540004730225, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7071319222450256, + "num_tokens": 579164875.0, + "step": 22387 + }, + { + "epoch": 2.4585987261146496, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5026841163635254, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6997570991516113, + "num_tokens": 579190306.0, + "step": 22388 + }, + { + "epoch": 2.4587085438172633, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.779690980911255, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7491967082023621, + "num_tokens": 579209165.0, + "step": 22389 + }, + { + "epoch": 2.458818361519877, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4906375408172607, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6917256712913513, + "num_tokens": 579236012.0, + "step": 22390 + }, + { + "epoch": 2.458928179222491, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.2905490398406982, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6959800124168396, + "num_tokens": 579266172.0, + "step": 22391 + }, + { + "epoch": 2.459037996925104, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4044697284698486, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7178202867507935, + "num_tokens": 579293066.0, + "step": 22392 + }, + { + "epoch": 2.459147814627718, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.378769874572754, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7064076662063599, + "num_tokens": 579318838.0, + "step": 22393 + }, + { + "epoch": 2.4592576323303317, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.104921340942383, + 
"learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.7011725902557373, + "num_tokens": 579352649.0, + "step": 22394 + }, + { + "epoch": 2.4593674500329454, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 7.107807159423828, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7243316769599915, + "num_tokens": 579376862.0, + "step": 22395 + }, + { + "epoch": 2.4594772677355587, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.3267252445220947, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7292948961257935, + "num_tokens": 579404419.0, + "step": 22396 + }, + { + "epoch": 2.4595870854381725, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5316498279571533, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7137881517410278, + "num_tokens": 579431207.0, + "step": 22397 + }, + { + "epoch": 2.4596969031407863, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.154513359069824, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7149143218994141, + "num_tokens": 579463351.0, + "step": 22398 + }, + { + "epoch": 2.4598067208434, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3804931640625, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7252135872840881, + "num_tokens": 579490210.0, + "step": 22399 + }, + { + "epoch": 2.4599165385460138, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4233503341674805, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7429596185684204, + "num_tokens": 579514549.0, + "step": 22400 + }, + { + "epoch": 2.460026356248627, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.5574846267700195, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7156208753585815, + "num_tokens": 579538738.0, + "step": 22401 + }, + { + "epoch": 2.460136173951241, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.317523956298828, + "learning_rate": 1e-06, + "loss": 1.0072, + 
"mean_token_accuracy": 0.7033357620239258, + "num_tokens": 579568753.0, + "step": 22402 + }, + { + "epoch": 2.4602459916538546, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.471006393432617, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7269229888916016, + "num_tokens": 579593085.0, + "step": 22403 + }, + { + "epoch": 2.4603558093564684, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6356759071350098, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6966991424560547, + "num_tokens": 579615851.0, + "step": 22404 + }, + { + "epoch": 2.460465627059082, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.332397222518921, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7339777946472168, + "num_tokens": 579641659.0, + "step": 22405 + }, + { + "epoch": 2.4605754447616954, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6543400287628174, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7207590341567993, + "num_tokens": 579665632.0, + "step": 22406 + }, + { + "epoch": 2.460685262464309, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.395674705505371, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7394979000091553, + "num_tokens": 579689755.0, + "step": 22407 + }, + { + "epoch": 2.460795080166923, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.551851987838745, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7227217555046082, + "num_tokens": 579713267.0, + "step": 22408 + }, + { + "epoch": 2.4609048978695367, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4428257942199707, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6959890127182007, + "num_tokens": 579739361.0, + "step": 22409 + }, + { + "epoch": 2.4610147155721505, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.6498589515686035, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 
0.6972006559371948, + "num_tokens": 579761589.0, + "step": 22410 + }, + { + "epoch": 2.4611245332747638, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.4506990909576416, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.7018463015556335, + "num_tokens": 579788253.0, + "step": 22411 + }, + { + "epoch": 2.4612343509773775, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.294189214706421, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6913747191429138, + "num_tokens": 579817323.0, + "step": 22412 + }, + { + "epoch": 2.4613441686799913, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4771580696105957, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7158850431442261, + "num_tokens": 579840291.0, + "step": 22413 + }, + { + "epoch": 2.461453986382605, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2694337368011475, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6893818378448486, + "num_tokens": 579872806.0, + "step": 22414 + }, + { + "epoch": 2.4615638040852184, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.239061117172241, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.708404541015625, + "num_tokens": 579905609.0, + "step": 22415 + }, + { + "epoch": 2.461673621787832, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.3159639835357666, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7114670872688293, + "num_tokens": 579934455.0, + "step": 22416 + }, + { + "epoch": 2.461783439490446, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.7391135692596436, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7306890487670898, + "num_tokens": 579956473.0, + "step": 22417 + }, + { + "epoch": 2.4618932571930596, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3617496490478516, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6971957683563232, + "num_tokens": 
579983569.0, + "step": 22418 + }, + { + "epoch": 2.462003074895673, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.373025417327881, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7137171030044556, + "num_tokens": 580010210.0, + "step": 22419 + }, + { + "epoch": 2.4621128925982867, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.473654270172119, + "learning_rate": 1e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.6833348274230957, + "num_tokens": 580035267.0, + "step": 22420 + }, + { + "epoch": 2.4622227103009005, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.42993426322937, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7225598096847534, + "num_tokens": 580059049.0, + "step": 22421 + }, + { + "epoch": 2.462332528003514, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.405433177947998, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7302733659744263, + "num_tokens": 580083726.0, + "step": 22422 + }, + { + "epoch": 2.462442345706128, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4622883796691895, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.7026840448379517, + "num_tokens": 580109057.0, + "step": 22423 + }, + { + "epoch": 2.4625521634087413, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.56857967376709, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7136444449424744, + "num_tokens": 580131527.0, + "step": 22424 + }, + { + "epoch": 2.462661981111355, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.4917404651641846, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7337007522583008, + "num_tokens": 580158296.0, + "step": 22425 + }, + { + "epoch": 2.462771798813969, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4359934329986572, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7067434787750244, + "num_tokens": 580185587.0, + "step": 22426 + }, + { + 
"epoch": 2.4628816165165826, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.346169948577881, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7158929109573364, + "num_tokens": 580211264.0, + "step": 22427 + }, + { + "epoch": 2.4629914342191963, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3443593978881836, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.728724479675293, + "num_tokens": 580238128.0, + "step": 22428 + }, + { + "epoch": 2.4631012519218096, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.4562573432922363, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7124221324920654, + "num_tokens": 580263380.0, + "step": 22429 + }, + { + "epoch": 2.4632110696244234, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3789801597595215, + "learning_rate": 1e-06, + "loss": 1.0888, + "mean_token_accuracy": 0.6823215484619141, + "num_tokens": 580291588.0, + "step": 22430 + }, + { + "epoch": 2.463320887327037, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.300729990005493, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.7033644914627075, + "num_tokens": 580321155.0, + "step": 22431 + }, + { + "epoch": 2.463430705029651, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 7.033205509185791, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7153163552284241, + "num_tokens": 580346786.0, + "step": 22432 + }, + { + "epoch": 2.4635405227322646, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.662785768508911, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7311142683029175, + "num_tokens": 580368009.0, + "step": 22433 + }, + { + "epoch": 2.463650340434878, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6614110469818115, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7282754182815552, + "num_tokens": 580391179.0, + "step": 22434 + }, + { + "epoch": 2.4637601581374917, + 
"ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4401209354400635, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7084484100341797, + "num_tokens": 580417400.0, + "step": 22435 + }, + { + "epoch": 2.4638699758401055, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5818984508514404, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7175443172454834, + "num_tokens": 580440859.0, + "step": 22436 + }, + { + "epoch": 2.4639797935427192, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4455816745758057, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7109265327453613, + "num_tokens": 580465329.0, + "step": 22437 + }, + { + "epoch": 2.464089611245333, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.229682207107544, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7335244417190552, + "num_tokens": 580494802.0, + "step": 22438 + }, + { + "epoch": 2.4641994289479463, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.350304365158081, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7226269841194153, + "num_tokens": 580520357.0, + "step": 22439 + }, + { + "epoch": 2.46430924665056, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.329434394836426, + "learning_rate": 1e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7550892233848572, + "num_tokens": 580544172.0, + "step": 22440 + }, + { + "epoch": 2.464419064353174, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6928699016571045, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.733851432800293, + "num_tokens": 580565362.0, + "step": 22441 + }, + { + "epoch": 2.4645288820557876, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3893051147460938, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7177557945251465, + "num_tokens": 580592144.0, + "step": 22442 + }, + { + "epoch": 2.464638699758401, + "ewc_loss": 2.205371856689453e-05, + 
"grad_norm": 2.4279961585998535, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7073965072631836, + "num_tokens": 580619409.0, + "step": 22443 + }, + { + "epoch": 2.4647485174610146, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3163259029388428, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7160220146179199, + "num_tokens": 580647254.0, + "step": 22444 + }, + { + "epoch": 2.4648583351636284, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5619025230407715, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7140847444534302, + "num_tokens": 580669445.0, + "step": 22445 + }, + { + "epoch": 2.464968152866242, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2882919311523438, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7246546745300293, + "num_tokens": 580698773.0, + "step": 22446 + }, + { + "epoch": 2.4650779705688555, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.207275152206421, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6948322057723999, + "num_tokens": 580728787.0, + "step": 22447 + }, + { + "epoch": 2.4651877882714692, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.289018392562866, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7139697670936584, + "num_tokens": 580758320.0, + "step": 22448 + }, + { + "epoch": 2.465297605974083, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.196882486343384, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7128896713256836, + "num_tokens": 580787451.0, + "step": 22449 + }, + { + "epoch": 2.4654074236766967, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.2944159507751465, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7217897772789001, + "num_tokens": 580813681.0, + "step": 22450 + }, + { + "epoch": 2.4655172413793105, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.414354085922241, + 
"learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.7028785943984985, + "num_tokens": 580838606.0, + "step": 22451 + }, + { + "epoch": 2.465627059081924, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.3141376972198486, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7182669639587402, + "num_tokens": 580866549.0, + "step": 22452 + }, + { + "epoch": 2.4657368767845376, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.364424705505371, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7008262276649475, + "num_tokens": 580893306.0, + "step": 22453 + }, + { + "epoch": 2.4658466944871513, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.6796653270721436, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7142340540885925, + "num_tokens": 580916349.0, + "step": 22454 + }, + { + "epoch": 2.465956512189765, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.840749979019165, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7355051040649414, + "num_tokens": 580936238.0, + "step": 22455 + }, + { + "epoch": 2.466066329892379, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.2279789447784424, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6908670663833618, + "num_tokens": 580970218.0, + "step": 22456 + }, + { + "epoch": 2.466176147594992, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.6189048290252686, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7265706062316895, + "num_tokens": 580993275.0, + "step": 22457 + }, + { + "epoch": 2.466285965297606, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.1739139556884766, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7270947098731995, + "num_tokens": 581023208.0, + "step": 22458 + }, + { + "epoch": 2.4663957830002197, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.393939971923828, + "learning_rate": 1e-06, + "loss": 
0.8827, + "mean_token_accuracy": 0.7427234649658203, + "num_tokens": 581047242.0, + "step": 22459 + }, + { + "epoch": 2.4665056007028334, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 2.662153959274292, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7317360043525696, + "num_tokens": 581069604.0, + "step": 22460 + }, + { + "epoch": 2.466615418405447, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.560537338256836, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7235735654830933, + "num_tokens": 581093665.0, + "step": 22461 + }, + { + "epoch": 2.4667252361080605, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 2.5040392875671387, + "learning_rate": 1e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7432409524917603, + "num_tokens": 581117503.0, + "step": 22462 + }, + { + "epoch": 2.4668350538106742, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6372323036193848, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7267069816589355, + "num_tokens": 581141075.0, + "step": 22463 + }, + { + "epoch": 2.466944871513288, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3381800651550293, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7025022506713867, + "num_tokens": 581168850.0, + "step": 22464 + }, + { + "epoch": 2.4670546892159018, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4757285118103027, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7163573503494263, + "num_tokens": 581194901.0, + "step": 22465 + }, + { + "epoch": 2.467164506918515, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.337411642074585, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7092018127441406, + "num_tokens": 581223253.0, + "step": 22466 + }, + { + "epoch": 2.467274324621129, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.192350149154663, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 
0.6934876441955566, + "num_tokens": 581257657.0, + "step": 22467 + }, + { + "epoch": 2.4673841423237426, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6749846935272217, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.713837206363678, + "num_tokens": 581280242.0, + "step": 22468 + }, + { + "epoch": 2.4674939600263563, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.36639666557312, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7107583284378052, + "num_tokens": 581309213.0, + "step": 22469 + }, + { + "epoch": 2.4676037777289697, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.84330153465271, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7167454361915588, + "num_tokens": 581328185.0, + "step": 22470 + }, + { + "epoch": 2.4677135954315834, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7420706748962402, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7450007200241089, + "num_tokens": 581348910.0, + "step": 22471 + }, + { + "epoch": 2.467823413134197, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4760968685150146, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7160099148750305, + "num_tokens": 581375189.0, + "step": 22472 + }, + { + "epoch": 2.467933230836811, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6361780166625977, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.73252272605896, + "num_tokens": 581396012.0, + "step": 22473 + }, + { + "epoch": 2.4680430485394247, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.440390110015869, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7158564329147339, + "num_tokens": 581423460.0, + "step": 22474 + }, + { + "epoch": 2.468152866242038, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.525029182434082, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7298791408538818, + "num_tokens": 
581449750.0, + "step": 22475 + }, + { + "epoch": 2.4682626839446518, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.570936679840088, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7275283932685852, + "num_tokens": 581470226.0, + "step": 22476 + }, + { + "epoch": 2.4683725016472655, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.5172693729400635, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7122206091880798, + "num_tokens": 581496730.0, + "step": 22477 + }, + { + "epoch": 2.4684823193498793, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3939905166625977, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7224969863891602, + "num_tokens": 581520234.0, + "step": 22478 + }, + { + "epoch": 2.468592137052493, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3297266960144043, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7207750678062439, + "num_tokens": 581548145.0, + "step": 22479 + }, + { + "epoch": 2.4687019547551063, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.823859691619873, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7221956849098206, + "num_tokens": 581570008.0, + "step": 22480 + }, + { + "epoch": 2.46881177245772, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.437282085418701, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7288855314254761, + "num_tokens": 581594675.0, + "step": 22481 + }, + { + "epoch": 2.468921590160334, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.344756841659546, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7153905630111694, + "num_tokens": 581621903.0, + "step": 22482 + }, + { + "epoch": 2.4690314078629476, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2812209129333496, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7087803483009338, + "num_tokens": 581653424.0, + "step": 22483 + }, + { + 
"epoch": 2.4691412255655614, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5044214725494385, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7153845429420471, + "num_tokens": 581680659.0, + "step": 22484 + }, + { + "epoch": 2.4692510432681747, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.415512800216675, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7104448676109314, + "num_tokens": 581706034.0, + "step": 22485 + }, + { + "epoch": 2.4693608609707884, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.194139242172241, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.7034117579460144, + "num_tokens": 581736537.0, + "step": 22486 + }, + { + "epoch": 2.469470678673402, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.685903549194336, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7374151945114136, + "num_tokens": 581758212.0, + "step": 22487 + }, + { + "epoch": 2.469580496376016, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8645517826080322, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7263581156730652, + "num_tokens": 581777728.0, + "step": 22488 + }, + { + "epoch": 2.4696903140786297, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.451206922531128, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7464838027954102, + "num_tokens": 581802384.0, + "step": 22489 + }, + { + "epoch": 2.469800131781243, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.299009323120117, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7029865980148315, + "num_tokens": 581831897.0, + "step": 22490 + }, + { + "epoch": 2.469909949483857, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.753716230392456, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7309454679489136, + "num_tokens": 581852383.0, + "step": 22491 + }, + { + "epoch": 2.4700197671864705, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.4574368000030518, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7177610397338867, + "num_tokens": 581876337.0, + "step": 22492 + }, + { + "epoch": 2.4701295848890843, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4011573791503906, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6970739364624023, + "num_tokens": 581904782.0, + "step": 22493 + }, + { + "epoch": 2.4702394025916976, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.171165704727173, + "learning_rate": 1e-06, + "loss": 1.0987, + "mean_token_accuracy": 0.6766262054443359, + "num_tokens": 581937627.0, + "step": 22494 + }, + { + "epoch": 2.4703492202943114, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.490163564682007, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7136942148208618, + "num_tokens": 581961556.0, + "step": 22495 + }, + { + "epoch": 2.470459037996925, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.608232021331787, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7375768423080444, + "num_tokens": 581982982.0, + "step": 22496 + }, + { + "epoch": 2.470568855699539, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5022096633911133, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7315372228622437, + "num_tokens": 582007050.0, + "step": 22497 + }, + { + "epoch": 2.470678673402152, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.226189613342285, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7206668257713318, + "num_tokens": 582037396.0, + "step": 22498 + }, + { + "epoch": 2.470788491104766, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.590067148208618, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6933433413505554, + "num_tokens": 582060656.0, + "step": 22499 + }, + { + "epoch": 2.4708983088073797, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.054975748062134, + "learning_rate": 1e-06, + "loss": 1.0755, + "mean_token_accuracy": 0.6836009621620178, + "num_tokens": 582098147.0, + "step": 22500 + }, + { + "epoch": 2.4710081265099935, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6391501426696777, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6993957161903381, + "num_tokens": 582120838.0, + "step": 22501 + }, + { + "epoch": 2.471117944212607, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6439619064331055, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7149389982223511, + "num_tokens": 582142587.0, + "step": 22502 + }, + { + "epoch": 2.4712277619152205, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.771815299987793, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7038421630859375, + "num_tokens": 582164094.0, + "step": 22503 + }, + { + "epoch": 2.4713375796178343, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.253591299057007, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7055078148841858, + "num_tokens": 582191897.0, + "step": 22504 + }, + { + "epoch": 2.471447397320448, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5894663333892822, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7143126726150513, + "num_tokens": 582215152.0, + "step": 22505 + }, + { + "epoch": 2.471557215023062, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.821226119995117, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7209069132804871, + "num_tokens": 582235363.0, + "step": 22506 + }, + { + "epoch": 2.4716670327256756, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1206440925598145, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.713366687297821, + "num_tokens": 582268099.0, + "step": 22507 + }, + { + "epoch": 2.471776850428289, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7076377868652344, + "learning_rate": 
1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.717578649520874, + "num_tokens": 582287577.0, + "step": 22508 + }, + { + "epoch": 2.4718866681309026, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3523879051208496, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7163307070732117, + "num_tokens": 582316791.0, + "step": 22509 + }, + { + "epoch": 2.4719964858335164, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.396683692932129, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7188396453857422, + "num_tokens": 582343571.0, + "step": 22510 + }, + { + "epoch": 2.47210630353613, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.470456838607788, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7111531496047974, + "num_tokens": 582370286.0, + "step": 22511 + }, + { + "epoch": 2.472216121238744, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.246344804763794, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7355690002441406, + "num_tokens": 582396706.0, + "step": 22512 + }, + { + "epoch": 2.472325938941357, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.755382776260376, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7086713314056396, + "num_tokens": 582419388.0, + "step": 22513 + }, + { + "epoch": 2.472435756643971, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2908225059509277, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7014684677124023, + "num_tokens": 582449128.0, + "step": 22514 + }, + { + "epoch": 2.4725455743465847, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2574570178985596, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.713959813117981, + "num_tokens": 582481366.0, + "step": 22515 + }, + { + "epoch": 2.4726553920491985, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.376397132873535, + "learning_rate": 1e-06, + "loss": 1.0253, + 
"mean_token_accuracy": 0.7006660103797913, + "num_tokens": 582509599.0, + "step": 22516 + }, + { + "epoch": 2.472765209751812, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.449338674545288, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7093806862831116, + "num_tokens": 582537510.0, + "step": 22517 + }, + { + "epoch": 2.4728750274544256, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2575581073760986, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7173668146133423, + "num_tokens": 582565833.0, + "step": 22518 + }, + { + "epoch": 2.4729848451570393, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.230020046234131, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7146270275115967, + "num_tokens": 582595069.0, + "step": 22519 + }, + { + "epoch": 2.473094662859653, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.218459367752075, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7089473605155945, + "num_tokens": 582626140.0, + "step": 22520 + }, + { + "epoch": 2.473204480562267, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1750504970550537, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.723164439201355, + "num_tokens": 582656988.0, + "step": 22521 + }, + { + "epoch": 2.47331429826488, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1673600673675537, + "learning_rate": 1e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.6848547458648682, + "num_tokens": 582690171.0, + "step": 22522 + }, + { + "epoch": 2.473424115967494, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4277119636535645, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7322168350219727, + "num_tokens": 582714233.0, + "step": 22523 + }, + { + "epoch": 2.4735339336701077, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.420665979385376, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7354872226715088, 
+ "num_tokens": 582738901.0, + "step": 22524 + }, + { + "epoch": 2.4736437513727214, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4845614433288574, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.734342098236084, + "num_tokens": 582761168.0, + "step": 22525 + }, + { + "epoch": 2.4737535690753347, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.406221389770508, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7132163047790527, + "num_tokens": 582786137.0, + "step": 22526 + }, + { + "epoch": 2.4738633867779485, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3299481868743896, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7039563059806824, + "num_tokens": 582812937.0, + "step": 22527 + }, + { + "epoch": 2.4739732044805622, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.484623908996582, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7052907943725586, + "num_tokens": 582840990.0, + "step": 22528 + }, + { + "epoch": 2.474083022183176, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.679069757461548, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7278424501419067, + "num_tokens": 582862034.0, + "step": 22529 + }, + { + "epoch": 2.4741928398857898, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.549577474594116, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7112279534339905, + "num_tokens": 582887359.0, + "step": 22530 + }, + { + "epoch": 2.474302657588403, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.387775421142578, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7343065738677979, + "num_tokens": 582911724.0, + "step": 22531 + }, + { + "epoch": 2.474412475291017, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.298839569091797, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7166893482208252, + "num_tokens": 582940079.0, + "step": 
22532 + }, + { + "epoch": 2.4745222929936306, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.425414562225342, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7147295475006104, + "num_tokens": 582966645.0, + "step": 22533 + }, + { + "epoch": 2.4746321106962443, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 7.023021221160889, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.71402907371521, + "num_tokens": 582990337.0, + "step": 22534 + }, + { + "epoch": 2.474741928398858, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3534023761749268, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7126520276069641, + "num_tokens": 583019007.0, + "step": 22535 + }, + { + "epoch": 2.4748517461014714, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4754281044006348, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7159867882728577, + "num_tokens": 583046525.0, + "step": 22536 + }, + { + "epoch": 2.474961563804085, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4034111499786377, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6960737109184265, + "num_tokens": 583075457.0, + "step": 22537 + }, + { + "epoch": 2.475071381506699, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.554905414581299, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7165791988372803, + "num_tokens": 583097935.0, + "step": 22538 + }, + { + "epoch": 2.4751811992093127, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8270978927612305, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.727415919303894, + "num_tokens": 583116913.0, + "step": 22539 + }, + { + "epoch": 2.4752910169119264, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.0270659923553467, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7280946969985962, + "num_tokens": 583152270.0, + "step": 22540 + }, + { + "epoch": 
2.4754008346145397, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3734664916992188, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7038257718086243, + "num_tokens": 583181986.0, + "step": 22541 + }, + { + "epoch": 2.4755106523171535, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1814308166503906, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7066226005554199, + "num_tokens": 583212767.0, + "step": 22542 + }, + { + "epoch": 2.4756204700197673, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.941519021987915, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7399998307228088, + "num_tokens": 583230748.0, + "step": 22543 + }, + { + "epoch": 2.475730287722381, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7008392810821533, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7391317486763, + "num_tokens": 583250752.0, + "step": 22544 + }, + { + "epoch": 2.4758401054249943, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4218242168426514, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6915680170059204, + "num_tokens": 583278437.0, + "step": 22545 + }, + { + "epoch": 2.475949923127608, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3060355186462402, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6972384452819824, + "num_tokens": 583305175.0, + "step": 22546 + }, + { + "epoch": 2.476059740830222, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7597479820251465, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7140638828277588, + "num_tokens": 583327026.0, + "step": 22547 + }, + { + "epoch": 2.4761695585328356, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2379558086395264, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.704912006855011, + "num_tokens": 583356237.0, + "step": 22548 + }, + { + "epoch": 2.476279376235449, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.2431046962738037, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7115309834480286, + "num_tokens": 583384805.0, + "step": 22549 + }, + { + "epoch": 2.4763891939380627, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4153130054473877, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7225208282470703, + "num_tokens": 583409634.0, + "step": 22550 + }, + { + "epoch": 2.4764990116406764, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.281498908996582, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6937839984893799, + "num_tokens": 583439142.0, + "step": 22551 + }, + { + "epoch": 2.47660882934329, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.58754301071167, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7013568878173828, + "num_tokens": 583462208.0, + "step": 22552 + }, + { + "epoch": 2.476718647045904, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4712414741516113, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7210788726806641, + "num_tokens": 583485821.0, + "step": 22553 + }, + { + "epoch": 2.4768284647485173, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2842347621917725, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6938613653182983, + "num_tokens": 583516572.0, + "step": 22554 + }, + { + "epoch": 2.476938282451131, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4568939208984375, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7140570878982544, + "num_tokens": 583542215.0, + "step": 22555 + }, + { + "epoch": 2.4770481001537448, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5106635093688965, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.69752037525177, + "num_tokens": 583568246.0, + "step": 22556 + }, + { + "epoch": 2.4771579178563585, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.531251907348633, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7235412001609802, + "num_tokens": 583593046.0, + "step": 22557 + }, + { + "epoch": 2.4772677355589723, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4982967376708984, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7296668291091919, + "num_tokens": 583617182.0, + "step": 22558 + }, + { + "epoch": 2.4773775532615856, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.576913356781006, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7129761576652527, + "num_tokens": 583640409.0, + "step": 22559 + }, + { + "epoch": 2.4774873709641994, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3818719387054443, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7152413725852966, + "num_tokens": 583667501.0, + "step": 22560 + }, + { + "epoch": 2.477597188666813, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5796210765838623, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.719286322593689, + "num_tokens": 583691981.0, + "step": 22561 + }, + { + "epoch": 2.477707006369427, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1311380863189697, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7126794457435608, + "num_tokens": 583723544.0, + "step": 22562 + }, + { + "epoch": 2.4778168240720406, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6775619983673096, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7186617851257324, + "num_tokens": 583745249.0, + "step": 22563 + }, + { + "epoch": 2.477926641774654, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6222152709960938, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7310043573379517, + "num_tokens": 583767713.0, + "step": 22564 + }, + { + "epoch": 2.4780364594772677, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3871798515319824, + 
"learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7160184383392334, + "num_tokens": 583796351.0, + "step": 22565 + }, + { + "epoch": 2.4781462771798815, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.720623731613159, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7279400825500488, + "num_tokens": 583818522.0, + "step": 22566 + }, + { + "epoch": 2.478256094882495, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6183292865753174, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.731601357460022, + "num_tokens": 583841306.0, + "step": 22567 + }, + { + "epoch": 2.478365912585109, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6160292625427246, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7417272925376892, + "num_tokens": 583862720.0, + "step": 22568 + }, + { + "epoch": 2.4784757302877223, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3452939987182617, + "learning_rate": 1e-06, + "loss": 1.0789, + "mean_token_accuracy": 0.6814365983009338, + "num_tokens": 583891557.0, + "step": 22569 + }, + { + "epoch": 2.478585547990336, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5521633625030518, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7136066555976868, + "num_tokens": 583915524.0, + "step": 22570 + }, + { + "epoch": 2.47869536569295, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.177781343460083, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7245693206787109, + "num_tokens": 583947311.0, + "step": 22571 + }, + { + "epoch": 2.4788051833955635, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.452181100845337, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7094912528991699, + "num_tokens": 583971538.0, + "step": 22572 + }, + { + "epoch": 2.478915001098177, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.9446120262145996, + "learning_rate": 1e-06, + "loss": 0.9171, 
+ "mean_token_accuracy": 0.7281895875930786, + "num_tokens": 583990662.0, + "step": 22573 + }, + { + "epoch": 2.4790248188007906, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2690954208374023, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7100152969360352, + "num_tokens": 584020069.0, + "step": 22574 + }, + { + "epoch": 2.4791346365034044, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.60168719291687, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7354831695556641, + "num_tokens": 584043942.0, + "step": 22575 + }, + { + "epoch": 2.479244454206018, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.746624231338501, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7168698310852051, + "num_tokens": 584064359.0, + "step": 22576 + }, + { + "epoch": 2.4793542719086314, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5192887783050537, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6988183259963989, + "num_tokens": 584089446.0, + "step": 22577 + }, + { + "epoch": 2.479464089611245, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.570552349090576, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7385952472686768, + "num_tokens": 584112276.0, + "step": 22578 + }, + { + "epoch": 2.479573907313859, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 3.077798843383789, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.702195942401886, + "num_tokens": 584131282.0, + "step": 22579 + }, + { + "epoch": 2.4796837250164727, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4380125999450684, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.7073299288749695, + "num_tokens": 584157027.0, + "step": 22580 + }, + { + "epoch": 2.4797935427190865, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.282792091369629, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 
0.6871365308761597, + "num_tokens": 584187271.0, + "step": 22581 + }, + { + "epoch": 2.4799033604217, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4710376262664795, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7132126092910767, + "num_tokens": 584214126.0, + "step": 22582 + }, + { + "epoch": 2.4800131781243135, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5411128997802734, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7297988533973694, + "num_tokens": 584238308.0, + "step": 22583 + }, + { + "epoch": 2.4801229958269273, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.644056558609009, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6910114884376526, + "num_tokens": 584260562.0, + "step": 22584 + }, + { + "epoch": 2.480232813529541, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6274054050445557, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7068663239479065, + "num_tokens": 584285600.0, + "step": 22585 + }, + { + "epoch": 2.480342631232155, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4866788387298584, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7196208834648132, + "num_tokens": 584309999.0, + "step": 22586 + }, + { + "epoch": 2.480452448934768, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7826743125915527, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7211396098136902, + "num_tokens": 584331339.0, + "step": 22587 + }, + { + "epoch": 2.480562266637382, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6466705799102783, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7526319622993469, + "num_tokens": 584352018.0, + "step": 22588 + }, + { + "epoch": 2.4806720843399956, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5672976970672607, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7509542107582092, + "num_tokens": 
584374314.0, + "step": 22589 + }, + { + "epoch": 2.4807819020426094, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.599491834640503, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6878727674484253, + "num_tokens": 584396902.0, + "step": 22590 + }, + { + "epoch": 2.480891719745223, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5158090591430664, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7290575504302979, + "num_tokens": 584419211.0, + "step": 22591 + }, + { + "epoch": 2.4810015374478365, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6820285320281982, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7162113189697266, + "num_tokens": 584442109.0, + "step": 22592 + }, + { + "epoch": 2.4811113551504502, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6024904251098633, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7207863926887512, + "num_tokens": 584466087.0, + "step": 22593 + }, + { + "epoch": 2.481221172853064, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.467473268508911, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7046521306037903, + "num_tokens": 584490736.0, + "step": 22594 + }, + { + "epoch": 2.4813309905556777, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1402275562286377, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.7100493907928467, + "num_tokens": 584525952.0, + "step": 22595 + }, + { + "epoch": 2.481440808258291, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.313631057739258, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.711190938949585, + "num_tokens": 584554175.0, + "step": 22596 + }, + { + "epoch": 2.481550625960905, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3740947246551514, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7192118167877197, + "num_tokens": 584581818.0, + "step": 22597 + }, + { + 
"epoch": 2.4816604436635186, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.399332046508789, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7181457281112671, + "num_tokens": 584608837.0, + "step": 22598 + }, + { + "epoch": 2.4817702613661323, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.359299659729004, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.698731541633606, + "num_tokens": 584637585.0, + "step": 22599 + }, + { + "epoch": 2.4818800790687456, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.448432683944702, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7171111702919006, + "num_tokens": 584661031.0, + "step": 22600 + }, + { + "epoch": 2.4819898967713594, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.457174062728882, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7241575121879578, + "num_tokens": 584684536.0, + "step": 22601 + }, + { + "epoch": 2.482099714473973, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.583400011062622, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7278742790222168, + "num_tokens": 584708199.0, + "step": 22602 + }, + { + "epoch": 2.482209532176587, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.479264497756958, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.695600152015686, + "num_tokens": 584733161.0, + "step": 22603 + }, + { + "epoch": 2.4823193498792007, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.494544744491577, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7284274697303772, + "num_tokens": 584757816.0, + "step": 22604 + }, + { + "epoch": 2.482429167581814, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.260690927505493, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.731934666633606, + "num_tokens": 584787031.0, + "step": 22605 + }, + { + "epoch": 2.4825389852844277, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.6327741146087646, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7319652438163757, + "num_tokens": 584808485.0, + "step": 22606 + }, + { + "epoch": 2.4826488029870415, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8124196529388428, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7195566296577454, + "num_tokens": 584830143.0, + "step": 22607 + }, + { + "epoch": 2.4827586206896552, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.535903215408325, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7135545611381531, + "num_tokens": 584854495.0, + "step": 22608 + }, + { + "epoch": 2.482868438392269, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6127190589904785, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7140932083129883, + "num_tokens": 584877030.0, + "step": 22609 + }, + { + "epoch": 2.4829782560948823, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.550246000289917, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7441425323486328, + "num_tokens": 584899034.0, + "step": 22610 + }, + { + "epoch": 2.483088073797496, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.522183895111084, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7097734808921814, + "num_tokens": 584924205.0, + "step": 22611 + }, + { + "epoch": 2.48319789150011, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4672131538391113, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6955622434616089, + "num_tokens": 584949529.0, + "step": 22612 + }, + { + "epoch": 2.4833077092027236, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7109763622283936, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7195873260498047, + "num_tokens": 584972765.0, + "step": 22613 + }, + { + "epoch": 2.4834175269053373, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.1764261722564697, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6948823928833008, + "num_tokens": 585004051.0, + "step": 22614 + }, + { + "epoch": 2.4835273446079507, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.149355411529541, + "learning_rate": 1e-06, + "loss": 1.0756, + "mean_token_accuracy": 0.6847379207611084, + "num_tokens": 585035043.0, + "step": 22615 + }, + { + "epoch": 2.4836371623105644, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.352201223373413, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7181060314178467, + "num_tokens": 585063833.0, + "step": 22616 + }, + { + "epoch": 2.483746980013178, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.487351894378662, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6986616849899292, + "num_tokens": 585089772.0, + "step": 22617 + }, + { + "epoch": 2.483856797715792, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.406803846359253, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.701005756855011, + "num_tokens": 585113862.0, + "step": 22618 + }, + { + "epoch": 2.4839666154184057, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.287200689315796, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7096141576766968, + "num_tokens": 585141848.0, + "step": 22619 + }, + { + "epoch": 2.484076433121019, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5469324588775635, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7124183177947998, + "num_tokens": 585165372.0, + "step": 22620 + }, + { + "epoch": 2.4841862508236328, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.466752290725708, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7061834335327148, + "num_tokens": 585190186.0, + "step": 22621 + }, + { + "epoch": 2.4842960685262465, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4379920959472656, + "learning_rate": 
1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7273050546646118, + "num_tokens": 585215818.0, + "step": 22622 + }, + { + "epoch": 2.4844058862288603, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.445669412612915, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7364711165428162, + "num_tokens": 585238265.0, + "step": 22623 + }, + { + "epoch": 2.4845157039314736, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.266878843307495, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.728268563747406, + "num_tokens": 585266467.0, + "step": 22624 + }, + { + "epoch": 2.4846255216340873, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.231107711791992, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7270674109458923, + "num_tokens": 585295652.0, + "step": 22625 + }, + { + "epoch": 2.484735339336701, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2813665866851807, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7280081510543823, + "num_tokens": 585323200.0, + "step": 22626 + }, + { + "epoch": 2.484845157039315, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5216171741485596, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7198153734207153, + "num_tokens": 585347754.0, + "step": 22627 + }, + { + "epoch": 2.484954974741928, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3792245388031006, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7171673774719238, + "num_tokens": 585373003.0, + "step": 22628 + }, + { + "epoch": 2.485064792444542, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3165109157562256, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7089157104492188, + "num_tokens": 585399580.0, + "step": 22629 + }, + { + "epoch": 2.4851746101471557, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.511059045791626, + "learning_rate": 1e-06, + "loss": 0.9135, + 
"mean_token_accuracy": 0.7250217795372009, + "num_tokens": 585423217.0, + "step": 22630 + }, + { + "epoch": 2.4852844278497694, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3460211753845215, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.7115556001663208, + "num_tokens": 585451117.0, + "step": 22631 + }, + { + "epoch": 2.485394245552383, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.244511365890503, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7152285575866699, + "num_tokens": 585480424.0, + "step": 22632 + }, + { + "epoch": 2.4855040632549965, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3480513095855713, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7274734377861023, + "num_tokens": 585508401.0, + "step": 22633 + }, + { + "epoch": 2.4856138809576103, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3402631282806396, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.701643168926239, + "num_tokens": 585537066.0, + "step": 22634 + }, + { + "epoch": 2.485723698660224, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.308901309967041, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7077063322067261, + "num_tokens": 585564235.0, + "step": 22635 + }, + { + "epoch": 2.485833516362838, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7639098167419434, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.722801685333252, + "num_tokens": 585585897.0, + "step": 22636 + }, + { + "epoch": 2.4859433340654515, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.369539499282837, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7127317190170288, + "num_tokens": 585613902.0, + "step": 22637 + }, + { + "epoch": 2.486053151768065, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.517766237258911, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 
0.7146196365356445, + "num_tokens": 585640191.0, + "step": 22638 + }, + { + "epoch": 2.4861629694706786, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.531146287918091, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7130120396614075, + "num_tokens": 585668203.0, + "step": 22639 + }, + { + "epoch": 2.4862727871732924, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6012909412384033, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.719626784324646, + "num_tokens": 585692082.0, + "step": 22640 + }, + { + "epoch": 2.486382604875906, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5744335651397705, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7370812296867371, + "num_tokens": 585713766.0, + "step": 22641 + }, + { + "epoch": 2.48649242257852, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4090158939361572, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7332534790039062, + "num_tokens": 585739024.0, + "step": 22642 + }, + { + "epoch": 2.486602240281133, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2647438049316406, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7078045606613159, + "num_tokens": 585768397.0, + "step": 22643 + }, + { + "epoch": 2.486712057983747, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.664745569229126, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7161686420440674, + "num_tokens": 585790832.0, + "step": 22644 + }, + { + "epoch": 2.4868218756863607, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.457620620727539, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7026954889297485, + "num_tokens": 585816244.0, + "step": 22645 + }, + { + "epoch": 2.4869316933889745, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.472142457962036, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7300722599029541, + "num_tokens": 
585841796.0, + "step": 22646 + }, + { + "epoch": 2.4870415110915878, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.35760498046875, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6925342082977295, + "num_tokens": 585868691.0, + "step": 22647 + }, + { + "epoch": 2.4871513287942015, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.337867259979248, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7076618671417236, + "num_tokens": 585896525.0, + "step": 22648 + }, + { + "epoch": 2.4872611464968153, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.381528854370117, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7126738429069519, + "num_tokens": 585922581.0, + "step": 22649 + }, + { + "epoch": 2.487370964199429, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.301417350769043, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7239739298820496, + "num_tokens": 585952214.0, + "step": 22650 + }, + { + "epoch": 2.487480781902043, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2944138050079346, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7165544629096985, + "num_tokens": 585981483.0, + "step": 22651 + }, + { + "epoch": 2.487590599604656, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.405616283416748, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7164428234100342, + "num_tokens": 586006401.0, + "step": 22652 + }, + { + "epoch": 2.48770041730727, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2863359451293945, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.7014024257659912, + "num_tokens": 586035868.0, + "step": 22653 + }, + { + "epoch": 2.4878102350098836, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.432445764541626, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7051301002502441, + "num_tokens": 586063210.0, + "step": 22654 + }, + { + 
"epoch": 2.4879200527124974, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3952479362487793, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.73255854845047, + "num_tokens": 586088749.0, + "step": 22655 + }, + { + "epoch": 2.4880298704151107, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4433276653289795, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.721785843372345, + "num_tokens": 586114721.0, + "step": 22656 + }, + { + "epoch": 2.4881396881177245, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.535057544708252, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7283033132553101, + "num_tokens": 586138740.0, + "step": 22657 + }, + { + "epoch": 2.488249505820338, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4719808101654053, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7074450254440308, + "num_tokens": 586163806.0, + "step": 22658 + }, + { + "epoch": 2.488359323522952, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3346619606018066, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6996505856513977, + "num_tokens": 586189894.0, + "step": 22659 + }, + { + "epoch": 2.4884691412255657, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.155348062515259, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7090839147567749, + "num_tokens": 586218294.0, + "step": 22660 + }, + { + "epoch": 2.488578958928179, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5975377559661865, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.716910719871521, + "num_tokens": 586241428.0, + "step": 22661 + }, + { + "epoch": 2.488688776630793, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7625434398651123, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7298716306686401, + "num_tokens": 586259610.0, + "step": 22662 + }, + { + "epoch": 2.4887985943334066, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.390929698944092, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7013741731643677, + "num_tokens": 586287264.0, + "step": 22663 + }, + { + "epoch": 2.4889084120360203, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.314100503921509, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7279661297798157, + "num_tokens": 586315879.0, + "step": 22664 + }, + { + "epoch": 2.489018229738634, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.779735565185547, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.7019144296646118, + "num_tokens": 586336976.0, + "step": 22665 + }, + { + "epoch": 2.4891280474412474, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1327500343322754, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6983722448348999, + "num_tokens": 586369199.0, + "step": 22666 + }, + { + "epoch": 2.489237865143861, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.895974636077881, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7195348739624023, + "num_tokens": 586387568.0, + "step": 22667 + }, + { + "epoch": 2.489347682846475, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.397189140319824, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7073538303375244, + "num_tokens": 586415242.0, + "step": 22668 + }, + { + "epoch": 2.4894575005490887, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.404106855392456, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7193881869316101, + "num_tokens": 586441756.0, + "step": 22669 + }, + { + "epoch": 2.4895673182517024, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3143930435180664, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7250975370407104, + "num_tokens": 586468933.0, + "step": 22670 + }, + { + "epoch": 2.4896771359543157, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.4660162925720215, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.7019587755203247, + "num_tokens": 586495660.0, + "step": 22671 + }, + { + "epoch": 2.4897869536569295, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3462753295898438, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7057170867919922, + "num_tokens": 586522449.0, + "step": 22672 + }, + { + "epoch": 2.4898967713595432, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 8.488232612609863, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7484003901481628, + "num_tokens": 586549179.0, + "step": 22673 + }, + { + "epoch": 2.490006589062157, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4521331787109375, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7165971994400024, + "num_tokens": 586574623.0, + "step": 22674 + }, + { + "epoch": 2.4901164067647703, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.555955648422241, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7047176957130432, + "num_tokens": 586600188.0, + "step": 22675 + }, + { + "epoch": 2.490226224467384, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5294690132141113, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7140320539474487, + "num_tokens": 586623978.0, + "step": 22676 + }, + { + "epoch": 2.490336042169998, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.614539384841919, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.7031331658363342, + "num_tokens": 586647401.0, + "step": 22677 + }, + { + "epoch": 2.4904458598726116, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8432459831237793, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7060940265655518, + "num_tokens": 586668775.0, + "step": 22678 + }, + { + "epoch": 2.490555677575225, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.480268716812134, + "learning_rate": 
1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7210857272148132, + "num_tokens": 586694421.0, + "step": 22679 + }, + { + "epoch": 2.4906654952778386, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3726108074188232, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7095210552215576, + "num_tokens": 586721979.0, + "step": 22680 + }, + { + "epoch": 2.4907753129804524, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3431291580200195, + "learning_rate": 1e-06, + "loss": 1.1172, + "mean_token_accuracy": 0.6765666007995605, + "num_tokens": 586752442.0, + "step": 22681 + }, + { + "epoch": 2.490885130683066, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3812599182128906, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6934405565261841, + "num_tokens": 586780628.0, + "step": 22682 + }, + { + "epoch": 2.49099494838568, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.162196636199951, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.731214165687561, + "num_tokens": 586809136.0, + "step": 22683 + }, + { + "epoch": 2.4911047660882932, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2718796730041504, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7123650312423706, + "num_tokens": 586838550.0, + "step": 22684 + }, + { + "epoch": 2.491214583790907, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.175873041152954, + "learning_rate": 1e-06, + "loss": 1.0942, + "mean_token_accuracy": 0.6812126636505127, + "num_tokens": 586872032.0, + "step": 22685 + }, + { + "epoch": 2.4913244014935207, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2016804218292236, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7040015459060669, + "num_tokens": 586906818.0, + "step": 22686 + }, + { + "epoch": 2.4914342191961345, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8298606872558594, + "learning_rate": 1e-06, + "loss": 0.9998, + 
"mean_token_accuracy": 0.7225900888442993, + "num_tokens": 586928153.0, + "step": 22687 + }, + { + "epoch": 2.4915440368987483, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4247889518737793, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7041590213775635, + "num_tokens": 586954338.0, + "step": 22688 + }, + { + "epoch": 2.4916538546013616, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.443021059036255, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7077688574790955, + "num_tokens": 586981709.0, + "step": 22689 + }, + { + "epoch": 2.4917636723039753, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4185101985931396, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7315143346786499, + "num_tokens": 587009605.0, + "step": 22690 + }, + { + "epoch": 2.491873490006589, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.647907257080078, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7262004017829895, + "num_tokens": 587032841.0, + "step": 22691 + }, + { + "epoch": 2.491983307709203, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.619277000427246, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7158951163291931, + "num_tokens": 587056958.0, + "step": 22692 + }, + { + "epoch": 2.4920931254118166, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.402132272720337, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7184997797012329, + "num_tokens": 587083782.0, + "step": 22693 + }, + { + "epoch": 2.49220294311443, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.465568780899048, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.693294882774353, + "num_tokens": 587110823.0, + "step": 22694 + }, + { + "epoch": 2.4923127608170437, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2886648178100586, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.698406457901001, 
+ "num_tokens": 587140248.0, + "step": 22695 + }, + { + "epoch": 2.4924225785196574, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.405331611633301, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6873555183410645, + "num_tokens": 587169688.0, + "step": 22696 + }, + { + "epoch": 2.492532396222271, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6165874004364014, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7258192896842957, + "num_tokens": 587192285.0, + "step": 22697 + }, + { + "epoch": 2.4926422139248845, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.544919729232788, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7291390299797058, + "num_tokens": 587213982.0, + "step": 22698 + }, + { + "epoch": 2.4927520316274983, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.393158435821533, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7179557681083679, + "num_tokens": 587241666.0, + "step": 22699 + }, + { + "epoch": 2.492861849330112, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6263186931610107, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7219409942626953, + "num_tokens": 587265561.0, + "step": 22700 + }, + { + "epoch": 2.4929716670327258, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.329691171646118, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6947898864746094, + "num_tokens": 587295396.0, + "step": 22701 + }, + { + "epoch": 2.4930814847353395, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3947460651397705, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7382298707962036, + "num_tokens": 587321031.0, + "step": 22702 + }, + { + "epoch": 2.493191302437953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 32.3806037902832, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7245758771896362, + "num_tokens": 587339850.0, + "step": 
22703 + }, + { + "epoch": 2.4933011201405666, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.25754976272583, + "learning_rate": 1e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.6864654421806335, + "num_tokens": 587369105.0, + "step": 22704 + }, + { + "epoch": 2.4934109378431804, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4435153007507324, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6961575150489807, + "num_tokens": 587397579.0, + "step": 22705 + }, + { + "epoch": 2.493520755545794, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4507946968078613, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.718578040599823, + "num_tokens": 587423378.0, + "step": 22706 + }, + { + "epoch": 2.4936305732484074, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6182188987731934, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7164895534515381, + "num_tokens": 587446342.0, + "step": 22707 + }, + { + "epoch": 2.493740390951021, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2035553455352783, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7070078253746033, + "num_tokens": 587476983.0, + "step": 22708 + }, + { + "epoch": 2.493850208653635, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8996338844299316, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7161083221435547, + "num_tokens": 587501236.0, + "step": 22709 + }, + { + "epoch": 2.4939600263562487, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.817138671875, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.716119110584259, + "num_tokens": 587521020.0, + "step": 22710 + }, + { + "epoch": 2.4940698440588625, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.369969367980957, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7099939584732056, + "num_tokens": 587547275.0, + "step": 22711 + }, + { + "epoch": 
2.4941796617614758, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4568512439727783, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7245467901229858, + "num_tokens": 587573084.0, + "step": 22712 + }, + { + "epoch": 2.4942894794640895, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2637743949890137, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7065622210502625, + "num_tokens": 587603257.0, + "step": 22713 + }, + { + "epoch": 2.4943992971667033, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3988120555877686, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7089284658432007, + "num_tokens": 587630187.0, + "step": 22714 + }, + { + "epoch": 2.494509114869317, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8713412284851074, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7158544659614563, + "num_tokens": 587649842.0, + "step": 22715 + }, + { + "epoch": 2.494618932571931, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.351459503173828, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7104096412658691, + "num_tokens": 587675550.0, + "step": 22716 + }, + { + "epoch": 2.494728750274544, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.572556972503662, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7195927500724792, + "num_tokens": 587698137.0, + "step": 22717 + }, + { + "epoch": 2.494838567977158, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.411778450012207, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7191891670227051, + "num_tokens": 587726673.0, + "step": 22718 + }, + { + "epoch": 2.4949483856797716, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2566633224487305, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7177273035049438, + "num_tokens": 587756002.0, + "step": 22719 + }, + { + "epoch": 2.4950582033823854, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.398338794708252, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7032138705253601, + "num_tokens": 587784302.0, + "step": 22720 + }, + { + "epoch": 2.495168021084999, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3302230834960938, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.715785026550293, + "num_tokens": 587813162.0, + "step": 22721 + }, + { + "epoch": 2.4952778387876124, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.228916645050049, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7287178635597229, + "num_tokens": 587844242.0, + "step": 22722 + }, + { + "epoch": 2.495387656490226, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.344376802444458, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7029604911804199, + "num_tokens": 587872383.0, + "step": 22723 + }, + { + "epoch": 2.49549747419284, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.598564386367798, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7127350568771362, + "num_tokens": 587895424.0, + "step": 22724 + }, + { + "epoch": 2.4956072918954537, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.08876895904541, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7135315537452698, + "num_tokens": 587929377.0, + "step": 22725 + }, + { + "epoch": 2.495717109598067, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.551982879638672, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7208060026168823, + "num_tokens": 587952057.0, + "step": 22726 + }, + { + "epoch": 2.495826927300681, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.511815071105957, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7398742437362671, + "num_tokens": 587973851.0, + "step": 22727 + }, + { + "epoch": 2.4959367450032945, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.4144740104675293, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7182564735412598, + "num_tokens": 588000156.0, + "step": 22728 + }, + { + "epoch": 2.4960465627059083, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3085854053497314, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7175426483154297, + "num_tokens": 588027864.0, + "step": 22729 + }, + { + "epoch": 2.4961563804085216, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5555238723754883, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6913328170776367, + "num_tokens": 588053574.0, + "step": 22730 + }, + { + "epoch": 2.4962661981111354, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.19777512550354, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.726657509803772, + "num_tokens": 588083717.0, + "step": 22731 + }, + { + "epoch": 2.496376015813749, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6488373279571533, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7169783115386963, + "num_tokens": 588106376.0, + "step": 22732 + }, + { + "epoch": 2.496485833516363, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4413576126098633, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7158121466636658, + "num_tokens": 588134677.0, + "step": 22733 + }, + { + "epoch": 2.4965956512189766, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.239210844039917, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7043024301528931, + "num_tokens": 588163054.0, + "step": 22734 + }, + { + "epoch": 2.49670546892159, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2751052379608154, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7100273370742798, + "num_tokens": 588192624.0, + "step": 22735 + }, + { + "epoch": 2.4968152866242037, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.440535545349121, + "learning_rate": 
1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7090885639190674, + "num_tokens": 588217520.0, + "step": 22736 + }, + { + "epoch": 2.4969251043268175, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3502211570739746, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7021766901016235, + "num_tokens": 588246847.0, + "step": 22737 + }, + { + "epoch": 2.4970349220294312, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6326045989990234, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7117855548858643, + "num_tokens": 588271089.0, + "step": 22738 + }, + { + "epoch": 2.497144739732045, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3565168380737305, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.711558997631073, + "num_tokens": 588296509.0, + "step": 22739 + }, + { + "epoch": 2.4972545574346583, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6561343669891357, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7090328931808472, + "num_tokens": 588320644.0, + "step": 22740 + }, + { + "epoch": 2.497364375137272, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.141218900680542, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7209038138389587, + "num_tokens": 588351365.0, + "step": 22741 + }, + { + "epoch": 2.497474192839886, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5115725994110107, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7181469202041626, + "num_tokens": 588374301.0, + "step": 22742 + }, + { + "epoch": 2.4975840105424996, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6639397144317627, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.720794677734375, + "num_tokens": 588394979.0, + "step": 22743 + }, + { + "epoch": 2.4976938282451133, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.614734649658203, + "learning_rate": 1e-06, + "loss": 0.9715, + 
"mean_token_accuracy": 0.7103996872901917, + "num_tokens": 588418678.0, + "step": 22744 + }, + { + "epoch": 2.4978036459477266, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.484830617904663, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7057914137840271, + "num_tokens": 588442045.0, + "step": 22745 + }, + { + "epoch": 2.4979134636503404, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6301262378692627, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7171351313591003, + "num_tokens": 588464154.0, + "step": 22746 + }, + { + "epoch": 2.498023281352954, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.406890869140625, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7296578884124756, + "num_tokens": 588490177.0, + "step": 22747 + }, + { + "epoch": 2.498133099055568, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4785983562469482, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7056573629379272, + "num_tokens": 588514830.0, + "step": 22748 + }, + { + "epoch": 2.4982429167581817, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.853512763977051, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7312413454055786, + "num_tokens": 588533952.0, + "step": 22749 + }, + { + "epoch": 2.498352734460795, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.837092161178589, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7141745686531067, + "num_tokens": 588553512.0, + "step": 22750 + }, + { + "epoch": 2.4984625521634087, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.856107711791992, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7442034482955933, + "num_tokens": 588570616.0, + "step": 22751 + }, + { + "epoch": 2.4985723698660225, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4642586708068848, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 
0.7054430842399597, + "num_tokens": 588595651.0, + "step": 22752 + }, + { + "epoch": 2.4986821875686362, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6168015003204346, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7320080995559692, + "num_tokens": 588617392.0, + "step": 22753 + }, + { + "epoch": 2.4987920052712496, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.246438980102539, + "learning_rate": 1e-06, + "loss": 1.0836, + "mean_token_accuracy": 0.685880184173584, + "num_tokens": 588649690.0, + "step": 22754 + }, + { + "epoch": 2.4989018229738633, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3532943725585938, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7068167328834534, + "num_tokens": 588680395.0, + "step": 22755 + }, + { + "epoch": 2.499011640676477, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.0717287063598633, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7010306119918823, + "num_tokens": 588715486.0, + "step": 22756 + }, + { + "epoch": 2.499121458379091, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.576359987258911, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7025430202484131, + "num_tokens": 588741555.0, + "step": 22757 + }, + { + "epoch": 2.499231276081704, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2057082653045654, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7162173986434937, + "num_tokens": 588771413.0, + "step": 22758 + }, + { + "epoch": 2.499341093784318, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6844053268432617, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7394617795944214, + "num_tokens": 588791492.0, + "step": 22759 + }, + { + "epoch": 2.4994509114869317, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.9205222129821777, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7229985594749451, + "num_tokens": 
588811461.0, + "step": 22760 + }, + { + "epoch": 2.4995607291895454, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4045867919921875, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7035140991210938, + "num_tokens": 588840279.0, + "step": 22761 + }, + { + "epoch": 2.499670546892159, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.405245304107666, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7175477147102356, + "num_tokens": 588866667.0, + "step": 22762 + }, + { + "epoch": 2.4997803645947725, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.301054000854492, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.708602249622345, + "num_tokens": 588894255.0, + "step": 22763 + }, + { + "epoch": 2.4998901822973862, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.433661937713623, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7095068097114563, + "num_tokens": 588921334.0, + "step": 22764 + }, + { + "epoch": 2.5, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.705026626586914, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7097200155258179, + "num_tokens": 588942704.0, + "step": 22765 + }, + { + "epoch": 2.5001098177026138, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2776594161987305, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7041106224060059, + "num_tokens": 588971823.0, + "step": 22766 + }, + { + "epoch": 2.5002196354052275, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.624011993408203, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7289136052131653, + "num_tokens": 588993280.0, + "step": 22767 + }, + { + "epoch": 2.500329453107841, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5941591262817383, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.7015296220779419, + "num_tokens": 589016288.0, + "step": 22768 + }, + { + "epoch": 
2.5004392708104546, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.7727437019348145, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7166028618812561, + "num_tokens": 589042750.0, + "step": 22769 + }, + { + "epoch": 2.5005490885130683, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.541097402572632, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6986082792282104, + "num_tokens": 589068047.0, + "step": 22770 + }, + { + "epoch": 2.500658906215682, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5931894779205322, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7242380380630493, + "num_tokens": 589090217.0, + "step": 22771 + }, + { + "epoch": 2.500768723918296, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2993876934051514, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7116405963897705, + "num_tokens": 589120073.0, + "step": 22772 + }, + { + "epoch": 2.500878541620909, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4746108055114746, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7357483506202698, + "num_tokens": 589144251.0, + "step": 22773 + }, + { + "epoch": 2.500988359323523, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.501892328262329, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7125714421272278, + "num_tokens": 589168790.0, + "step": 22774 + }, + { + "epoch": 2.5010981770261367, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.399235725402832, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7264134883880615, + "num_tokens": 589193331.0, + "step": 22775 + }, + { + "epoch": 2.5012079947287504, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.294797420501709, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7116102576255798, + "num_tokens": 589221114.0, + "step": 22776 + }, + { + "epoch": 2.501317812431364, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.3670406341552734, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7445390224456787, + "num_tokens": 589246004.0, + "step": 22777 + }, + { + "epoch": 2.5014276301339775, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.9594643115997314, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.714298665523529, + "num_tokens": 589265631.0, + "step": 22778 + }, + { + "epoch": 2.5015374478365913, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.639430046081543, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7397009134292603, + "num_tokens": 589287728.0, + "step": 22779 + }, + { + "epoch": 2.501647265539205, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.402114152908325, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7132381796836853, + "num_tokens": 589313655.0, + "step": 22780 + }, + { + "epoch": 2.5017570832418183, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5314509868621826, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7169889807701111, + "num_tokens": 589337141.0, + "step": 22781 + }, + { + "epoch": 2.501866900944432, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5706093311309814, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7149643898010254, + "num_tokens": 589359967.0, + "step": 22782 + }, + { + "epoch": 2.501976718647046, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.243738889694214, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7399563789367676, + "num_tokens": 589387337.0, + "step": 22783 + }, + { + "epoch": 2.5020865363496596, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4264492988586426, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7121765613555908, + "num_tokens": 589414927.0, + "step": 22784 + }, + { + "epoch": 2.5021963540522734, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.7920632362365723, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7231783866882324, + "num_tokens": 589436169.0, + "step": 22785 + }, + { + "epoch": 2.5023061717548867, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.318391799926758, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.7107467651367188, + "num_tokens": 589464166.0, + "step": 22786 + }, + { + "epoch": 2.5024159894575004, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2156972885131836, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6966044902801514, + "num_tokens": 589495499.0, + "step": 22787 + }, + { + "epoch": 2.502525807160114, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3504531383514404, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7075932025909424, + "num_tokens": 589522298.0, + "step": 22788 + }, + { + "epoch": 2.502635624862728, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.771890878677368, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7265787720680237, + "num_tokens": 589542211.0, + "step": 22789 + }, + { + "epoch": 2.5027454425653417, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.753795623779297, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6969560384750366, + "num_tokens": 589563880.0, + "step": 22790 + }, + { + "epoch": 2.502855260267955, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.299368143081665, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7556729316711426, + "num_tokens": 589590574.0, + "step": 22791 + }, + { + "epoch": 2.5029650779705688, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.292121171951294, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7232211232185364, + "num_tokens": 589619617.0, + "step": 22792 + }, + { + "epoch": 2.5030748956731825, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.726846694946289, + "learning_rate": 
1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7049301266670227, + "num_tokens": 589640033.0, + "step": 22793 + }, + { + "epoch": 2.5031847133757963, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3945930004119873, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7301012277603149, + "num_tokens": 589665609.0, + "step": 22794 + }, + { + "epoch": 2.50329453107841, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6093544960021973, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7257291674613953, + "num_tokens": 589687734.0, + "step": 22795 + }, + { + "epoch": 2.5034043487810234, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.734057664871216, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.743400514125824, + "num_tokens": 589707580.0, + "step": 22796 + }, + { + "epoch": 2.503514166483637, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3205692768096924, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7474954128265381, + "num_tokens": 589734307.0, + "step": 22797 + }, + { + "epoch": 2.503623984186251, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5570027828216553, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7210954427719116, + "num_tokens": 589757196.0, + "step": 22798 + }, + { + "epoch": 2.5037338018888646, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.452425718307495, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7259671092033386, + "num_tokens": 589780928.0, + "step": 22799 + }, + { + "epoch": 2.5038436195914784, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.24871563911438, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7241394519805908, + "num_tokens": 589810612.0, + "step": 22800 + }, + { + "epoch": 2.5039534372940917, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.669191360473633, + "learning_rate": 1e-06, + "loss": 0.9868, + 
"mean_token_accuracy": 0.7144132852554321, + "num_tokens": 589831931.0, + "step": 22801 + }, + { + "epoch": 2.5040632549967055, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3556602001190186, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7078206539154053, + "num_tokens": 589861413.0, + "step": 22802 + }, + { + "epoch": 2.504173072699319, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.269545316696167, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7107220888137817, + "num_tokens": 589891007.0, + "step": 22803 + }, + { + "epoch": 2.5042828904019325, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4364960193634033, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7088256478309631, + "num_tokens": 589918620.0, + "step": 22804 + }, + { + "epoch": 2.5043927081045467, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3746917247772217, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7151498794555664, + "num_tokens": 589946586.0, + "step": 22805 + }, + { + "epoch": 2.50450252580716, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.307157516479492, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7231745719909668, + "num_tokens": 589974246.0, + "step": 22806 + }, + { + "epoch": 2.504612343509774, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3204829692840576, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.7098001837730408, + "num_tokens": 590002770.0, + "step": 22807 + }, + { + "epoch": 2.5047221612123876, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5036585330963135, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7132006287574768, + "num_tokens": 590027131.0, + "step": 22808 + }, + { + "epoch": 2.504831978915001, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8967926502227783, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 
0.7142124176025391, + "num_tokens": 590046283.0, + "step": 22809 + }, + { + "epoch": 2.5049417966176146, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.315997838973999, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6912068724632263, + "num_tokens": 590076302.0, + "step": 22810 + }, + { + "epoch": 2.5050516143202284, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.408353090286255, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.700237512588501, + "num_tokens": 590102808.0, + "step": 22811 + }, + { + "epoch": 2.505161432022842, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4018702507019043, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7128848433494568, + "num_tokens": 590128825.0, + "step": 22812 + }, + { + "epoch": 2.505271249725456, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.355187177658081, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7254807949066162, + "num_tokens": 590153176.0, + "step": 22813 + }, + { + "epoch": 2.505381067428069, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.375688076019287, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7167252898216248, + "num_tokens": 590179251.0, + "step": 22814 + }, + { + "epoch": 2.505490885130683, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.715703248977661, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.720710039138794, + "num_tokens": 590199334.0, + "step": 22815 + }, + { + "epoch": 2.5056007028332967, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.206711530685425, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7132877707481384, + "num_tokens": 590230521.0, + "step": 22816 + }, + { + "epoch": 2.5057105205359105, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4887566566467285, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7016153335571289, + "num_tokens": 
590256396.0, + "step": 22817 + }, + { + "epoch": 2.5058203382385242, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.511183500289917, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7133185863494873, + "num_tokens": 590280641.0, + "step": 22818 + }, + { + "epoch": 2.5059301559411375, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4103002548217773, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7016409635543823, + "num_tokens": 590307933.0, + "step": 22819 + }, + { + "epoch": 2.5060399736437513, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.642777681350708, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7035804986953735, + "num_tokens": 590328960.0, + "step": 22820 + }, + { + "epoch": 2.506149791346365, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.50277042388916, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7334027290344238, + "num_tokens": 590350891.0, + "step": 22821 + }, + { + "epoch": 2.506259609048979, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.698503017425537, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7292851805686951, + "num_tokens": 590371449.0, + "step": 22822 + }, + { + "epoch": 2.5063694267515926, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6598803997039795, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7487643361091614, + "num_tokens": 590393247.0, + "step": 22823 + }, + { + "epoch": 2.506479244454206, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6965153217315674, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7133716344833374, + "num_tokens": 590416572.0, + "step": 22824 + }, + { + "epoch": 2.5065890621568196, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5420517921447754, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7278090715408325, + "num_tokens": 590442330.0, + "step": 22825 + }, + { + 
"epoch": 2.5066988798594334, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3528475761413574, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6950002312660217, + "num_tokens": 590474719.0, + "step": 22826 + }, + { + "epoch": 2.506808697562047, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2358131408691406, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7546181082725525, + "num_tokens": 590502613.0, + "step": 22827 + }, + { + "epoch": 2.506918515264661, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4981608390808105, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.715505838394165, + "num_tokens": 590528490.0, + "step": 22828 + }, + { + "epoch": 2.5070283329672742, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.402672290802002, + "learning_rate": 1e-06, + "loss": 1.0799, + "mean_token_accuracy": 0.6949551105499268, + "num_tokens": 590556084.0, + "step": 22829 + }, + { + "epoch": 2.507138150669888, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5715277194976807, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7139323949813843, + "num_tokens": 590579798.0, + "step": 22830 + }, + { + "epoch": 2.5072479683725017, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.669581174850464, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7177652716636658, + "num_tokens": 590601789.0, + "step": 22831 + }, + { + "epoch": 2.507357786075115, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.653165817260742, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7147412300109863, + "num_tokens": 590626173.0, + "step": 22832 + }, + { + "epoch": 2.507467603777729, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5015869140625, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7058970928192139, + "num_tokens": 590653382.0, + "step": 22833 + }, + { + "epoch": 2.5075774214803426, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.4046080112457275, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7366963624954224, + "num_tokens": 590681835.0, + "step": 22834 + }, + { + "epoch": 2.5076872391829563, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.621795177459717, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7263979911804199, + "num_tokens": 590703115.0, + "step": 22835 + }, + { + "epoch": 2.50779705688557, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3187127113342285, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7266342639923096, + "num_tokens": 590729143.0, + "step": 22836 + }, + { + "epoch": 2.5079068745881834, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3287484645843506, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.7004832625389099, + "num_tokens": 590757784.0, + "step": 22837 + }, + { + "epoch": 2.508016692290797, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.885894536972046, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7272331714630127, + "num_tokens": 590777166.0, + "step": 22838 + }, + { + "epoch": 2.508126509993411, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5233068466186523, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7441800832748413, + "num_tokens": 590801084.0, + "step": 22839 + }, + { + "epoch": 2.5082363276960247, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.756035327911377, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7185617685317993, + "num_tokens": 590823667.0, + "step": 22840 + }, + { + "epoch": 2.5083461453986384, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3103723526000977, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6922760009765625, + "num_tokens": 590853423.0, + "step": 22841 + }, + { + "epoch": 2.5084559631012517, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.68912672996521, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7174432873725891, + "num_tokens": 590876373.0, + "step": 22842 + }, + { + "epoch": 2.5085657808038655, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.575042247772217, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7130382061004639, + "num_tokens": 590900751.0, + "step": 22843 + }, + { + "epoch": 2.5086755985064793, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.347862720489502, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7062349319458008, + "num_tokens": 590929624.0, + "step": 22844 + }, + { + "epoch": 2.508785416209093, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.315119504928589, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7094888687133789, + "num_tokens": 590957559.0, + "step": 22845 + }, + { + "epoch": 2.5088952339117068, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.815089225769043, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.723425030708313, + "num_tokens": 590977189.0, + "step": 22846 + }, + { + "epoch": 2.50900505161432, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.374122381210327, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7224085330963135, + "num_tokens": 591006782.0, + "step": 22847 + }, + { + "epoch": 2.509114869316934, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.365328550338745, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7263240814208984, + "num_tokens": 591032821.0, + "step": 22848 + }, + { + "epoch": 2.5092246870195476, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.364438056945801, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7239605784416199, + "num_tokens": 591063078.0, + "step": 22849 + }, + { + "epoch": 2.5093345047221614, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.276118040084839, + "learning_rate": 1e-06, 
+ "loss": 1.0284, + "mean_token_accuracy": 0.7118978500366211, + "num_tokens": 591093057.0, + "step": 22850 + }, + { + "epoch": 2.509444322424775, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4587557315826416, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.733882486820221, + "num_tokens": 591117707.0, + "step": 22851 + }, + { + "epoch": 2.5095541401273884, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7886199951171875, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7221453785896301, + "num_tokens": 591139261.0, + "step": 22852 + }, + { + "epoch": 2.509663957830002, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5271782875061035, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7151812314987183, + "num_tokens": 591163001.0, + "step": 22853 + }, + { + "epoch": 2.509773775532616, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2900209426879883, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7220526933670044, + "num_tokens": 591191088.0, + "step": 22854 + }, + { + "epoch": 2.5098835932352292, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.390256643295288, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.691342294216156, + "num_tokens": 591217394.0, + "step": 22855 + }, + { + "epoch": 2.5099934109378434, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7400550842285156, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.725674033164978, + "num_tokens": 591238614.0, + "step": 22856 + }, + { + "epoch": 2.5101032286404568, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.483130931854248, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.712071418762207, + "num_tokens": 591264604.0, + "step": 22857 + }, + { + "epoch": 2.5102130463430705, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.622912645339966, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 
0.7353192567825317, + "num_tokens": 591287228.0, + "step": 22858 + }, + { + "epoch": 2.5103228640456843, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4744982719421387, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7068488597869873, + "num_tokens": 591312753.0, + "step": 22859 + }, + { + "epoch": 2.5104326817482976, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.484482526779175, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7122291922569275, + "num_tokens": 591338562.0, + "step": 22860 + }, + { + "epoch": 2.5105424994509113, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3036575317382812, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6907992362976074, + "num_tokens": 591366615.0, + "step": 22861 + }, + { + "epoch": 2.510652317153525, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.500354290008545, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7092894315719604, + "num_tokens": 591390433.0, + "step": 22862 + }, + { + "epoch": 2.510762134856139, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5401644706726074, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7033712863922119, + "num_tokens": 591412696.0, + "step": 22863 + }, + { + "epoch": 2.5108719525587526, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.316416025161743, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7193603515625, + "num_tokens": 591439559.0, + "step": 22864 + }, + { + "epoch": 2.510981770261366, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.520094633102417, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7475471496582031, + "num_tokens": 591459784.0, + "step": 22865 + }, + { + "epoch": 2.5110915879639797, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.494913339614868, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7190368175506592, + "num_tokens": 
591483730.0, + "step": 22866 + }, + { + "epoch": 2.5112014056665934, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.5101075172424316, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7208108901977539, + "num_tokens": 591509318.0, + "step": 22867 + }, + { + "epoch": 2.511311223369207, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6415202617645264, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7203410863876343, + "num_tokens": 591532774.0, + "step": 22868 + }, + { + "epoch": 2.511421041071821, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7866809368133545, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7047898769378662, + "num_tokens": 591553811.0, + "step": 22869 + }, + { + "epoch": 2.5115308587744343, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.081912040710449, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.7028030753135681, + "num_tokens": 591587246.0, + "step": 22870 + }, + { + "epoch": 2.511640676477048, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.281313896179199, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6896681785583496, + "num_tokens": 591616083.0, + "step": 22871 + }, + { + "epoch": 2.511750494179662, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.271432638168335, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7413538694381714, + "num_tokens": 591642943.0, + "step": 22872 + }, + { + "epoch": 2.5118603118822755, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.591526985168457, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7315731048583984, + "num_tokens": 591668235.0, + "step": 22873 + }, + { + "epoch": 2.5119701295848893, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.361618757247925, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7111349701881409, + "num_tokens": 591696854.0, + "step": 22874 + }, + 
{ + "epoch": 2.5120799472875026, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.471299409866333, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7074158787727356, + "num_tokens": 591722220.0, + "step": 22875 + }, + { + "epoch": 2.5121897649901164, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4223392009735107, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.698756754398346, + "num_tokens": 591749674.0, + "step": 22876 + }, + { + "epoch": 2.51229958269273, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3218894004821777, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.741012454032898, + "num_tokens": 591777655.0, + "step": 22877 + }, + { + "epoch": 2.512409400395344, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.1965601444244385, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7142207026481628, + "num_tokens": 591810654.0, + "step": 22878 + }, + { + "epoch": 2.5125192180979576, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4883816242218018, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7158706188201904, + "num_tokens": 591838576.0, + "step": 22879 + }, + { + "epoch": 2.512629035800571, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.528291940689087, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7446037530899048, + "num_tokens": 591860586.0, + "step": 22880 + }, + { + "epoch": 2.5127388535031847, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.6058547496795654, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7222057580947876, + "num_tokens": 591885342.0, + "step": 22881 + }, + { + "epoch": 2.5128486712057985, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.6170687675476074, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7265722751617432, + "num_tokens": 591907658.0, + "step": 22882 + }, + { + "epoch": 2.512958488908412, 
+ "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.526507616043091, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7125891447067261, + "num_tokens": 591933944.0, + "step": 22883 + }, + { + "epoch": 2.5130683066110255, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.2838363647460938, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7081908583641052, + "num_tokens": 591965979.0, + "step": 22884 + }, + { + "epoch": 2.5131781243136393, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.7436182498931885, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7337130308151245, + "num_tokens": 591985122.0, + "step": 22885 + }, + { + "epoch": 2.513287942016253, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4759511947631836, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7162386178970337, + "num_tokens": 592009719.0, + "step": 22886 + }, + { + "epoch": 2.513397759718867, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.365593910217285, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7133404016494751, + "num_tokens": 592036278.0, + "step": 22887 + }, + { + "epoch": 2.51350757742148, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.307737112045288, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7097777128219604, + "num_tokens": 592062284.0, + "step": 22888 + }, + { + "epoch": 2.513617395124094, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.0294580459594727, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.70307856798172, + "num_tokens": 592098411.0, + "step": 22889 + }, + { + "epoch": 2.5137272128267076, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.54087233543396, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7232789993286133, + "num_tokens": 592124820.0, + "step": 22890 + }, + { + "epoch": 2.5138370305293214, + "ewc_loss": 
2.2172927856445312e-05, + "grad_norm": 2.8492610454559326, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7279373407363892, + "num_tokens": 592143372.0, + "step": 22891 + }, + { + "epoch": 2.513946848231935, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.3199338912963867, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7099298238754272, + "num_tokens": 592171269.0, + "step": 22892 + }, + { + "epoch": 2.5140566659345485, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4141459465026855, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7102375030517578, + "num_tokens": 592196793.0, + "step": 22893 + }, + { + "epoch": 2.514166483637162, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.461458921432495, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7214993238449097, + "num_tokens": 592221733.0, + "step": 22894 + }, + { + "epoch": 2.514276301339776, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4551525115966797, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7041406035423279, + "num_tokens": 592247370.0, + "step": 22895 + }, + { + "epoch": 2.5143861190423897, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.2591159343719482, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6892609596252441, + "num_tokens": 592276498.0, + "step": 22896 + }, + { + "epoch": 2.5144959367450035, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.5359504222869873, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7175811529159546, + "num_tokens": 592300544.0, + "step": 22897 + }, + { + "epoch": 2.514605754447617, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.3968377113342285, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6995456218719482, + "num_tokens": 592330526.0, + "step": 22898 + }, + { + "epoch": 2.5147155721502306, + "ewc_loss": 2.2172927856445312e-05, + 
"grad_norm": 2.3754260540008545, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7258533239364624, + "num_tokens": 592358300.0, + "step": 22899 + }, + { + "epoch": 2.5148253898528443, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.425541877746582, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7175896167755127, + "num_tokens": 592385102.0, + "step": 22900 + }, + { + "epoch": 2.514935207555458, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.2396953105926514, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7223390936851501, + "num_tokens": 592415240.0, + "step": 22901 + }, + { + "epoch": 2.515045025258072, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.902621030807495, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7334259152412415, + "num_tokens": 592432834.0, + "step": 22902 + }, + { + "epoch": 2.515154842960685, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.530762195587158, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7319502234458923, + "num_tokens": 592457682.0, + "step": 22903 + }, + { + "epoch": 2.515264660663299, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4935333728790283, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7412710189819336, + "num_tokens": 592481379.0, + "step": 22904 + }, + { + "epoch": 2.5153744783659127, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.184250831604004, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.739710807800293, + "num_tokens": 592511498.0, + "step": 22905 + }, + { + "epoch": 2.515484296068526, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.613489866256714, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.711864709854126, + "num_tokens": 592535201.0, + "step": 22906 + }, + { + "epoch": 2.51559411377114, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.06607985496521, + 
"learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7119258046150208, + "num_tokens": 592569210.0, + "step": 22907 + }, + { + "epoch": 2.5157039314737535, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.9913653135299683, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6986640691757202, + "num_tokens": 592606097.0, + "step": 22908 + }, + { + "epoch": 2.5158137491763672, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.8844735622406006, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7185460329055786, + "num_tokens": 592625761.0, + "step": 22909 + }, + { + "epoch": 2.515923566878981, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4548425674438477, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7383726835250854, + "num_tokens": 592649864.0, + "step": 22910 + }, + { + "epoch": 2.5160333845815943, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5312867164611816, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7126798629760742, + "num_tokens": 592674280.0, + "step": 22911 + }, + { + "epoch": 2.516143202284208, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.617051839828491, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.7004331350326538, + "num_tokens": 592696943.0, + "step": 22912 + }, + { + "epoch": 2.516253019986822, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4942758083343506, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7188739776611328, + "num_tokens": 592722978.0, + "step": 22913 + }, + { + "epoch": 2.5163628376894356, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5032591819763184, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7337476015090942, + "num_tokens": 592746688.0, + "step": 22914 + }, + { + "epoch": 2.5164726553920493, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.3280320167541504, + "learning_rate": 1e-06, + "loss": 
0.9962, + "mean_token_accuracy": 0.7149620056152344, + "num_tokens": 592776387.0, + "step": 22915 + }, + { + "epoch": 2.5165824730946627, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4605698585510254, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7009881734848022, + "num_tokens": 592800075.0, + "step": 22916 + }, + { + "epoch": 2.5166922907972764, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.263852834701538, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7319880127906799, + "num_tokens": 592826613.0, + "step": 22917 + }, + { + "epoch": 2.51680210849989, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.449152708053589, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7203918695449829, + "num_tokens": 592851609.0, + "step": 22918 + }, + { + "epoch": 2.516911926202504, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.3923356533050537, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7151288986206055, + "num_tokens": 592879143.0, + "step": 22919 + }, + { + "epoch": 2.5170217439051177, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.418483257293701, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.718976616859436, + "num_tokens": 592905359.0, + "step": 22920 + }, + { + "epoch": 2.517131561607731, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.291433095932007, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7045594453811646, + "num_tokens": 592934634.0, + "step": 22921 + }, + { + "epoch": 2.5172413793103448, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4193005561828613, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7301130890846252, + "num_tokens": 592961039.0, + "step": 22922 + }, + { + "epoch": 2.5173511970129585, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4033660888671875, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 
0.7060481309890747, + "num_tokens": 592988378.0, + "step": 22923 + }, + { + "epoch": 2.5174610147155723, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.3393030166625977, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7064887285232544, + "num_tokens": 593017157.0, + "step": 22924 + }, + { + "epoch": 2.517570832418186, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.27886700630188, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6886228322982788, + "num_tokens": 593052808.0, + "step": 22925 + }, + { + "epoch": 2.5176806501207993, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.459059000015259, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7337539196014404, + "num_tokens": 593077480.0, + "step": 22926 + }, + { + "epoch": 2.517790467823413, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.1078848838806152, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.719038188457489, + "num_tokens": 593111996.0, + "step": 22927 + }, + { + "epoch": 2.517900285526027, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.342560291290283, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6920371055603027, + "num_tokens": 593139114.0, + "step": 22928 + }, + { + "epoch": 2.5180101032286406, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4760336875915527, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7131722569465637, + "num_tokens": 593167042.0, + "step": 22929 + }, + { + "epoch": 2.5181199209312544, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5375277996063232, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7059072256088257, + "num_tokens": 593191849.0, + "step": 22930 + }, + { + "epoch": 2.5182297386338677, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.402555227279663, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7194386720657349, + "num_tokens": 
593218714.0, + "step": 22931 + }, + { + "epoch": 2.5183395563364814, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3428096771240234, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7089128494262695, + "num_tokens": 593247208.0, + "step": 22932 + }, + { + "epoch": 2.518449374039095, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.46877384185791, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7028821706771851, + "num_tokens": 593272635.0, + "step": 22933 + }, + { + "epoch": 2.5185591917417085, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5422332286834717, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7132113575935364, + "num_tokens": 593297594.0, + "step": 22934 + }, + { + "epoch": 2.5186690094443223, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2873785495758057, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7075392007827759, + "num_tokens": 593329123.0, + "step": 22935 + }, + { + "epoch": 2.518778827146936, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.31315541267395, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7148218154907227, + "num_tokens": 593358256.0, + "step": 22936 + }, + { + "epoch": 2.5188886448495498, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7696454524993896, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7387651205062866, + "num_tokens": 593379323.0, + "step": 22937 + }, + { + "epoch": 2.5189984625521635, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2632229328155518, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7248508930206299, + "num_tokens": 593412384.0, + "step": 22938 + }, + { + "epoch": 2.519108280254777, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.435166120529175, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.705842137336731, + "num_tokens": 593439200.0, + "step": 22939 + }, + { + 
"epoch": 2.5192180979573906, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2990996837615967, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6945967674255371, + "num_tokens": 593467494.0, + "step": 22940 + }, + { + "epoch": 2.5193279156600044, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.158421516418457, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7167876958847046, + "num_tokens": 593497676.0, + "step": 22941 + }, + { + "epoch": 2.519437733362618, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6374478340148926, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7371426820755005, + "num_tokens": 593519999.0, + "step": 22942 + }, + { + "epoch": 2.519547551065232, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.423365592956543, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6933183670043945, + "num_tokens": 593546497.0, + "step": 22943 + }, + { + "epoch": 2.519657368767845, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.34847354888916, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7181811332702637, + "num_tokens": 593571869.0, + "step": 22944 + }, + { + "epoch": 2.519767186470459, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4749221801757812, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7197198271751404, + "num_tokens": 593595902.0, + "step": 22945 + }, + { + "epoch": 2.5198770041730727, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.792829990386963, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7359898090362549, + "num_tokens": 593616021.0, + "step": 22946 + }, + { + "epoch": 2.5199868218756865, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4877758026123047, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7008537650108337, + "num_tokens": 593640677.0, + "step": 22947 + }, + { + "epoch": 2.5200966395783, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.0924432277679443, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6962709426879883, + "num_tokens": 593675273.0, + "step": 22948 + }, + { + "epoch": 2.5202064572809135, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6312179565429688, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7061796188354492, + "num_tokens": 593698371.0, + "step": 22949 + }, + { + "epoch": 2.5203162749835273, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.737473726272583, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6928994059562683, + "num_tokens": 593719487.0, + "step": 22950 + }, + { + "epoch": 2.520426092686141, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.306184768676758, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7162491083145142, + "num_tokens": 593748209.0, + "step": 22951 + }, + { + "epoch": 2.520535910388755, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1615827083587646, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7093319296836853, + "num_tokens": 593779992.0, + "step": 22952 + }, + { + "epoch": 2.5206457280913686, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.473623752593994, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7240835428237915, + "num_tokens": 593805162.0, + "step": 22953 + }, + { + "epoch": 2.520755545793982, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5776212215423584, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7357897758483887, + "num_tokens": 593826809.0, + "step": 22954 + }, + { + "epoch": 2.5208653634965956, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.403470993041992, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7138698697090149, + "num_tokens": 593854502.0, + "step": 22955 + }, + { + "epoch": 2.5209751811992094, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.410727024078369, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6943937540054321, + "num_tokens": 593882364.0, + "step": 22956 + }, + { + "epoch": 2.521084998901823, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1723482608795166, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7078087329864502, + "num_tokens": 593914002.0, + "step": 22957 + }, + { + "epoch": 2.521194816604437, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2500531673431396, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7346758842468262, + "num_tokens": 593942545.0, + "step": 22958 + }, + { + "epoch": 2.52130463430705, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3962855339050293, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7120794057846069, + "num_tokens": 593969663.0, + "step": 22959 + }, + { + "epoch": 2.521414452009664, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.564527750015259, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7022961378097534, + "num_tokens": 593993735.0, + "step": 22960 + }, + { + "epoch": 2.5215242697122777, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2254743576049805, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7180382013320923, + "num_tokens": 594022149.0, + "step": 22961 + }, + { + "epoch": 2.521634087414891, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4471025466918945, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7209428548812866, + "num_tokens": 594048254.0, + "step": 22962 + }, + { + "epoch": 2.521743905117505, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.695969343185425, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7255666851997375, + "num_tokens": 594069964.0, + "step": 22963 + }, + { + "epoch": 2.5218537228201185, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4153194427490234, + "learning_rate": 
1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7290288805961609, + "num_tokens": 594094639.0, + "step": 22964 + }, + { + "epoch": 2.5219635405227323, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8029308319091797, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7317699193954468, + "num_tokens": 594115174.0, + "step": 22965 + }, + { + "epoch": 2.522073358225346, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.477492094039917, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7208336591720581, + "num_tokens": 594140095.0, + "step": 22966 + }, + { + "epoch": 2.5221831759279594, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5683577060699463, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7093524932861328, + "num_tokens": 594162787.0, + "step": 22967 + }, + { + "epoch": 2.522292993630573, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.6086554527282715, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.714309573173523, + "num_tokens": 594188094.0, + "step": 22968 + }, + { + "epoch": 2.522402811333187, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5833561420440674, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7139368653297424, + "num_tokens": 594213783.0, + "step": 22969 + }, + { + "epoch": 2.5225126290358006, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3423142433166504, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.704224169254303, + "num_tokens": 594240306.0, + "step": 22970 + }, + { + "epoch": 2.5226224467384144, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2067160606384277, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7181118726730347, + "num_tokens": 594269709.0, + "step": 22971 + }, + { + "epoch": 2.5227322644410277, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7270209789276123, + "learning_rate": 1e-06, + "loss": 0.9149, + 
"mean_token_accuracy": 0.7312511801719666, + "num_tokens": 594289564.0, + "step": 22972 + }, + { + "epoch": 2.5228420821436415, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3205697536468506, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.711329996585846, + "num_tokens": 594317962.0, + "step": 22973 + }, + { + "epoch": 2.5229518998462552, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.534076452255249, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7294639348983765, + "num_tokens": 594340334.0, + "step": 22974 + }, + { + "epoch": 2.523061717548869, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6461575031280518, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7012292146682739, + "num_tokens": 594364305.0, + "step": 22975 + }, + { + "epoch": 2.5231715352514827, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8512089252471924, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7429157495498657, + "num_tokens": 594383972.0, + "step": 22976 + }, + { + "epoch": 2.523281352954096, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1958577632904053, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7209732532501221, + "num_tokens": 594413752.0, + "step": 22977 + }, + { + "epoch": 2.52339117065671, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4123291969299316, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6995658278465271, + "num_tokens": 594442079.0, + "step": 22978 + }, + { + "epoch": 2.5235009883593236, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6214237213134766, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6948257684707642, + "num_tokens": 594465901.0, + "step": 22979 + }, + { + "epoch": 2.5236108060619373, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3024179935455322, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 
0.7019903659820557, + "num_tokens": 594496278.0, + "step": 22980 + }, + { + "epoch": 2.523720623764551, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.379563331604004, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7218732833862305, + "num_tokens": 594520948.0, + "step": 22981 + }, + { + "epoch": 2.5238304414671644, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3076698780059814, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7236315011978149, + "num_tokens": 594548810.0, + "step": 22982 + }, + { + "epoch": 2.523940259169778, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.49714732170105, + "learning_rate": 1e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6877472400665283, + "num_tokens": 594574861.0, + "step": 22983 + }, + { + "epoch": 2.524050076872392, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.233551502227783, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6978356242179871, + "num_tokens": 594604307.0, + "step": 22984 + }, + { + "epoch": 2.5241598945750052, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.458616018295288, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.7009677886962891, + "num_tokens": 594629850.0, + "step": 22985 + }, + { + "epoch": 2.5242697122776194, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.931460380554199, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7203720808029175, + "num_tokens": 594649519.0, + "step": 22986 + }, + { + "epoch": 2.5243795299802327, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.894979238510132, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7170246243476868, + "num_tokens": 594669664.0, + "step": 22987 + }, + { + "epoch": 2.5244893476828465, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.31569504737854, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.7041803598403931, + "num_tokens": 
594698318.0, + "step": 22988 + }, + { + "epoch": 2.5245991653854603, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.501201629638672, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7041563391685486, + "num_tokens": 594724229.0, + "step": 22989 + }, + { + "epoch": 2.5247089830880736, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.550295829772949, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7083193063735962, + "num_tokens": 594750082.0, + "step": 22990 + }, + { + "epoch": 2.5248188007906873, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.783931016921997, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7262170910835266, + "num_tokens": 594769454.0, + "step": 22991 + }, + { + "epoch": 2.524928618493301, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.6671810150146484, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7283691167831421, + "num_tokens": 594790967.0, + "step": 22992 + }, + { + "epoch": 2.525038436195915, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.666735887527466, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7014645338058472, + "num_tokens": 594813218.0, + "step": 22993 + }, + { + "epoch": 2.5251482538985286, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.198418378829956, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7049806118011475, + "num_tokens": 594844603.0, + "step": 22994 + }, + { + "epoch": 2.525258071601142, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.3789944648742676, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.710186779499054, + "num_tokens": 594871719.0, + "step": 22995 + }, + { + "epoch": 2.5253678893037557, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.5453431606292725, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7100198268890381, + "num_tokens": 594895488.0, + "step": 22996 + }, + 
{ + "epoch": 2.5254777070063694, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.560490846633911, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7209118604660034, + "num_tokens": 594919364.0, + "step": 22997 + }, + { + "epoch": 2.525587524708983, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.419995069503784, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7152087688446045, + "num_tokens": 594946118.0, + "step": 22998 + }, + { + "epoch": 2.525697342411597, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.6208014488220215, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7027488946914673, + "num_tokens": 594969450.0, + "step": 22999 + }, + { + "epoch": 2.5258071601142102, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.8987667560577393, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7210456132888794, + "num_tokens": 594988795.0, + "step": 23000 + }, + { + "epoch": 2.525916977816824, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.6360957622528076, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7280372381210327, + "num_tokens": 595016239.0, + "step": 23001 + }, + { + "epoch": 2.5260267955194378, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.5479557514190674, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7297708988189697, + "num_tokens": 595041564.0, + "step": 23002 + }, + { + "epoch": 2.5261366132220515, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.642120122909546, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7249779105186462, + "num_tokens": 595064933.0, + "step": 23003 + }, + { + "epoch": 2.5262464309246653, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.354274272918701, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7026074528694153, + "num_tokens": 595094820.0, + "step": 23004 + }, + { + "epoch": 
2.5263562486272786, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.4341561794281006, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7380815744400024, + "num_tokens": 595120478.0, + "step": 23005 + }, + { + "epoch": 2.5264660663298923, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.5124876499176025, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7288886308670044, + "num_tokens": 595144689.0, + "step": 23006 + }, + { + "epoch": 2.526575884032506, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.312020778656006, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7146268486976624, + "num_tokens": 595171859.0, + "step": 23007 + }, + { + "epoch": 2.52668570173512, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4626100063323975, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7222967147827148, + "num_tokens": 595196206.0, + "step": 23008 + }, + { + "epoch": 2.5267955194377336, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7113966941833496, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6975094079971313, + "num_tokens": 595219237.0, + "step": 23009 + }, + { + "epoch": 2.526905337140347, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.294175148010254, + "learning_rate": 1e-06, + "loss": 1.1105, + "mean_token_accuracy": 0.6811116337776184, + "num_tokens": 595249726.0, + "step": 23010 + }, + { + "epoch": 2.5270151548429607, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4566545486450195, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7145724296569824, + "num_tokens": 595274027.0, + "step": 23011 + }, + { + "epoch": 2.5271249725455744, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.544290542602539, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7005919218063354, + "num_tokens": 595296661.0, + "step": 23012 + }, + { + "epoch": 2.5272347902481878, + "ewc_loss": 
2.205371856689453e-05, + "grad_norm": 2.6516306400299072, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7262140512466431, + "num_tokens": 595319437.0, + "step": 23013 + }, + { + "epoch": 2.5273446079508015, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.275186777114868, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6970376968383789, + "num_tokens": 595351096.0, + "step": 23014 + }, + { + "epoch": 2.5274544256534153, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.589291572570801, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7117205262184143, + "num_tokens": 595376735.0, + "step": 23015 + }, + { + "epoch": 2.527564243356029, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4491448402404785, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7089288234710693, + "num_tokens": 595404045.0, + "step": 23016 + }, + { + "epoch": 2.527674061058643, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.441373586654663, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7086648941040039, + "num_tokens": 595429491.0, + "step": 23017 + }, + { + "epoch": 2.527783878761256, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3719522953033447, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.696997344493866, + "num_tokens": 595457524.0, + "step": 23018 + }, + { + "epoch": 2.52789369646387, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.111211061477661, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7197496891021729, + "num_tokens": 595490527.0, + "step": 23019 + }, + { + "epoch": 2.5280035141664836, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6268649101257324, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7107160091400146, + "num_tokens": 595515326.0, + "step": 23020 + }, + { + "epoch": 2.5281133318690974, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 
2.1227612495422363, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7040094137191772, + "num_tokens": 595547889.0, + "step": 23021 + }, + { + "epoch": 2.528223149571711, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.306070566177368, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.7043600082397461, + "num_tokens": 595576016.0, + "step": 23022 + }, + { + "epoch": 2.5283329672743244, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5245871543884277, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7046274542808533, + "num_tokens": 595599972.0, + "step": 23023 + }, + { + "epoch": 2.528442784976938, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2667691707611084, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.719904899597168, + "num_tokens": 595629268.0, + "step": 23024 + }, + { + "epoch": 2.528552602679552, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5247585773468018, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7179276347160339, + "num_tokens": 595653835.0, + "step": 23025 + }, + { + "epoch": 2.5286624203821657, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.43159818649292, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7078512907028198, + "num_tokens": 595679525.0, + "step": 23026 + }, + { + "epoch": 2.5287722380847795, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.548578977584839, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6989794373512268, + "num_tokens": 595704461.0, + "step": 23027 + }, + { + "epoch": 2.528882055787393, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6691694259643555, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7161973118782043, + "num_tokens": 595726467.0, + "step": 23028 + }, + { + "epoch": 2.5289918734900065, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.572728395462036, + "learning_rate": 
1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.7242457866668701, + "num_tokens": 595749917.0, + "step": 23029 + }, + { + "epoch": 2.5291016911926203, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.331566333770752, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7002514600753784, + "num_tokens": 595778799.0, + "step": 23030 + }, + { + "epoch": 2.529211508895234, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.714155673980713, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7139629125595093, + "num_tokens": 595798731.0, + "step": 23031 + }, + { + "epoch": 2.529321326597848, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6462557315826416, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7054435610771179, + "num_tokens": 595824073.0, + "step": 23032 + }, + { + "epoch": 2.529431144300461, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4619359970092773, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7048637866973877, + "num_tokens": 595849564.0, + "step": 23033 + }, + { + "epoch": 2.529540962003075, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.369882106781006, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7120407223701477, + "num_tokens": 595878108.0, + "step": 23034 + }, + { + "epoch": 2.5296507797056886, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.250636100769043, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7014104127883911, + "num_tokens": 595906890.0, + "step": 23035 + }, + { + "epoch": 2.529760597408302, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7659735679626465, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7207784652709961, + "num_tokens": 595926522.0, + "step": 23036 + }, + { + "epoch": 2.529870415110916, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.468136787414551, + "learning_rate": 1e-06, + "loss": 0.985, + 
"mean_token_accuracy": 0.708116888999939, + "num_tokens": 595949645.0, + "step": 23037 + }, + { + "epoch": 2.5299802328135295, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.344879150390625, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7071037888526917, + "num_tokens": 595977614.0, + "step": 23038 + }, + { + "epoch": 2.530090050516143, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.202585220336914, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.704258918762207, + "num_tokens": 596007884.0, + "step": 23039 + }, + { + "epoch": 2.530199868218757, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4620280265808105, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7338107824325562, + "num_tokens": 596031739.0, + "step": 23040 + }, + { + "epoch": 2.5303096859213703, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.8915140628814697, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7234635353088379, + "num_tokens": 596050106.0, + "step": 23041 + }, + { + "epoch": 2.530419503623984, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.737037420272827, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7545404434204102, + "num_tokens": 596069132.0, + "step": 23042 + }, + { + "epoch": 2.530529321326598, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.198270559310913, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7203975319862366, + "num_tokens": 596097303.0, + "step": 23043 + }, + { + "epoch": 2.5306391390292116, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5583112239837646, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7304192185401917, + "num_tokens": 596121083.0, + "step": 23044 + }, + { + "epoch": 2.5307489567318253, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3303639888763428, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 
0.7059558629989624, + "num_tokens": 596148564.0, + "step": 23045 + }, + { + "epoch": 2.5308587744344386, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.46606183052063, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7279331684112549, + "num_tokens": 596174298.0, + "step": 23046 + }, + { + "epoch": 2.5309685921370524, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.473409652709961, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.691704511642456, + "num_tokens": 596198003.0, + "step": 23047 + }, + { + "epoch": 2.531078409839666, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.71492862701416, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7366265058517456, + "num_tokens": 596220404.0, + "step": 23048 + }, + { + "epoch": 2.53118822754228, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3327441215515137, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6974173784255981, + "num_tokens": 596253105.0, + "step": 23049 + }, + { + "epoch": 2.5312980452448937, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.47642183303833, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7145653963088989, + "num_tokens": 596277084.0, + "step": 23050 + }, + { + "epoch": 2.531407862947507, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.537733793258667, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6997189521789551, + "num_tokens": 596300341.0, + "step": 23051 + }, + { + "epoch": 2.5315176806501207, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5416460037231445, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7294719219207764, + "num_tokens": 596322781.0, + "step": 23052 + }, + { + "epoch": 2.5316274983527345, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.506784677505493, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7167696356773376, + "num_tokens": 
596345388.0, + "step": 23053 + }, + { + "epoch": 2.5317373160553482, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.22900128364563, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7087730169296265, + "num_tokens": 596373827.0, + "step": 23054 + }, + { + "epoch": 2.531847133757962, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5415446758270264, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7298054695129395, + "num_tokens": 596396903.0, + "step": 23055 + }, + { + "epoch": 2.5319569514605753, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.411074161529541, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.7004374861717224, + "num_tokens": 596424384.0, + "step": 23056 + }, + { + "epoch": 2.532066769163189, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5266928672790527, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.714143693447113, + "num_tokens": 596448517.0, + "step": 23057 + }, + { + "epoch": 2.532176586865803, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.317121982574463, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6949473023414612, + "num_tokens": 596476470.0, + "step": 23058 + }, + { + "epoch": 2.5322864045684166, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2688474655151367, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7209811210632324, + "num_tokens": 596504634.0, + "step": 23059 + }, + { + "epoch": 2.5323962222710303, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2340586185455322, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6999092102050781, + "num_tokens": 596533937.0, + "step": 23060 + }, + { + "epoch": 2.5325060399736437, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.192898750305176, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6878309845924377, + "num_tokens": 596566767.0, + "step": 23061 + }, + { + 
"epoch": 2.5326158576762574, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4963295459747314, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7022900581359863, + "num_tokens": 596592004.0, + "step": 23062 + }, + { + "epoch": 2.532725675378871, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4609181880950928, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7193741202354431, + "num_tokens": 596617623.0, + "step": 23063 + }, + { + "epoch": 2.5328354930814845, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.2893357276916504, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7188855409622192, + "num_tokens": 596647337.0, + "step": 23064 + }, + { + "epoch": 2.5329453107840982, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.693169593811035, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7149397730827332, + "num_tokens": 596668598.0, + "step": 23065 + }, + { + "epoch": 2.533055128486712, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.327805280685425, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7107104063034058, + "num_tokens": 596699776.0, + "step": 23066 + }, + { + "epoch": 2.5331649461893258, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.480644464492798, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.7006453275680542, + "num_tokens": 596725281.0, + "step": 23067 + }, + { + "epoch": 2.5332747638919395, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.705867052078247, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7293914556503296, + "num_tokens": 596747043.0, + "step": 23068 + }, + { + "epoch": 2.533384581594553, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.671719551086426, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.688890814781189, + "num_tokens": 596771535.0, + "step": 23069 + }, + { + "epoch": 2.5334943992971666, + 
"ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4953620433807373, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7033841609954834, + "num_tokens": 596798443.0, + "step": 23070 + }, + { + "epoch": 2.5336042169997803, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.8293237686157227, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7285608053207397, + "num_tokens": 596819292.0, + "step": 23071 + }, + { + "epoch": 2.533714034702394, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4260449409484863, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.729528546333313, + "num_tokens": 596848840.0, + "step": 23072 + }, + { + "epoch": 2.533823852405008, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3696682453155518, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7176039218902588, + "num_tokens": 596876828.0, + "step": 23073 + }, + { + "epoch": 2.533933670107621, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.391273021697998, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7019914984703064, + "num_tokens": 596903668.0, + "step": 23074 + }, + { + "epoch": 2.534043487810235, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1029739379882812, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6985061168670654, + "num_tokens": 596939752.0, + "step": 23075 + }, + { + "epoch": 2.5341533055128487, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.558380365371704, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7025256156921387, + "num_tokens": 596965149.0, + "step": 23076 + }, + { + "epoch": 2.5342631232154624, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.519350051879883, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7216720581054688, + "num_tokens": 596988327.0, + "step": 23077 + }, + { + "epoch": 2.534372940918076, + "ewc_loss": 2.205371856689453e-05, + 
"grad_norm": 2.535759210586548, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7177876234054565, + "num_tokens": 597013645.0, + "step": 23078 + }, + { + "epoch": 2.5344827586206895, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.567690849304199, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.721217155456543, + "num_tokens": 597036964.0, + "step": 23079 + }, + { + "epoch": 2.5345925763233033, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.7036020755767822, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.738292396068573, + "num_tokens": 597055558.0, + "step": 23080 + }, + { + "epoch": 2.534702394025917, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4362940788269043, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.7020736336708069, + "num_tokens": 597081249.0, + "step": 23081 + }, + { + "epoch": 2.5348122117285308, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4552929401397705, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7262458205223083, + "num_tokens": 597104520.0, + "step": 23082 + }, + { + "epoch": 2.5349220294311445, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1628758907318115, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7192773818969727, + "num_tokens": 597136510.0, + "step": 23083 + }, + { + "epoch": 2.535031847133758, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.694145441055298, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.70948725938797, + "num_tokens": 597157658.0, + "step": 23084 + }, + { + "epoch": 2.5351416648363716, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.0988011360168457, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7139608860015869, + "num_tokens": 597189398.0, + "step": 23085 + }, + { + "epoch": 2.5352514825389854, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.612694025039673, + 
"learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7316702008247375, + "num_tokens": 597210921.0, + "step": 23086 + }, + { + "epoch": 2.5353613002415987, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3404507637023926, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7297284603118896, + "num_tokens": 597237123.0, + "step": 23087 + }, + { + "epoch": 2.535471117944213, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5718095302581787, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7272247076034546, + "num_tokens": 597260578.0, + "step": 23088 + }, + { + "epoch": 2.535580935646826, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.3420722484588623, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7244850397109985, + "num_tokens": 597289215.0, + "step": 23089 + }, + { + "epoch": 2.53569075334944, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.373976707458496, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7108680009841919, + "num_tokens": 597316786.0, + "step": 23090 + }, + { + "epoch": 2.5358005710520537, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.486130952835083, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7036877870559692, + "num_tokens": 597340930.0, + "step": 23091 + }, + { + "epoch": 2.535910388754667, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.6865289211273193, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7105147838592529, + "num_tokens": 597364706.0, + "step": 23092 + }, + { + "epoch": 2.5360202064572808, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.682739734649658, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7089288234710693, + "num_tokens": 597386630.0, + "step": 23093 + }, + { + "epoch": 2.5361300241598945, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5576741695404053, + "learning_rate": 1e-06, + "loss": 
0.9646, + "mean_token_accuracy": 0.7204850316047668, + "num_tokens": 597410659.0, + "step": 23094 + }, + { + "epoch": 2.5362398418625083, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4882631301879883, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7145271301269531, + "num_tokens": 597437572.0, + "step": 23095 + }, + { + "epoch": 2.536349659565122, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.380596160888672, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.717930793762207, + "num_tokens": 597460736.0, + "step": 23096 + }, + { + "epoch": 2.5364594772677354, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4296467304229736, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7138647437095642, + "num_tokens": 597485885.0, + "step": 23097 + }, + { + "epoch": 2.536569294970349, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.4020142555236816, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.71848064661026, + "num_tokens": 597511911.0, + "step": 23098 + }, + { + "epoch": 2.536679112672963, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.318347454071045, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7191948294639587, + "num_tokens": 597540549.0, + "step": 23099 + }, + { + "epoch": 2.5367889303755766, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.328217029571533, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7130374908447266, + "num_tokens": 597568988.0, + "step": 23100 + }, + { + "epoch": 2.5368987480781904, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.234070301055908, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7086992859840393, + "num_tokens": 597599355.0, + "step": 23101 + }, + { + "epoch": 2.5370085657808037, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.319181442260742, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 
0.7170069217681885, + "num_tokens": 597626892.0, + "step": 23102 + }, + { + "epoch": 2.5371183834834174, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2135097980499268, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.7061856985092163, + "num_tokens": 597656696.0, + "step": 23103 + }, + { + "epoch": 2.537228201186031, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2551212310791016, + "learning_rate": 1e-06, + "loss": 1.1315, + "mean_token_accuracy": 0.6781816482543945, + "num_tokens": 597686492.0, + "step": 23104 + }, + { + "epoch": 2.537338018888645, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.1650137901306152, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6933243870735168, + "num_tokens": 597720272.0, + "step": 23105 + }, + { + "epoch": 2.5374478365912587, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.563386917114258, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7129784822463989, + "num_tokens": 597745245.0, + "step": 23106 + }, + { + "epoch": 2.537557654293872, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.5541365146636963, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7158383727073669, + "num_tokens": 597769483.0, + "step": 23107 + }, + { + "epoch": 2.537667471996486, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.385313034057617, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7248721122741699, + "num_tokens": 597794789.0, + "step": 23108 + }, + { + "epoch": 2.5377772896990995, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.299912452697754, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7155587673187256, + "num_tokens": 597823774.0, + "step": 23109 + }, + { + "epoch": 2.5378871074017133, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.278071403503418, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7077234983444214, + "num_tokens": 
597854226.0, + "step": 23110 + }, + { + "epoch": 2.537996925104327, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.547048568725586, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7277007102966309, + "num_tokens": 597879666.0, + "step": 23111 + }, + { + "epoch": 2.5381067428069404, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.58520245552063, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7276148796081543, + "num_tokens": 597903409.0, + "step": 23112 + }, + { + "epoch": 2.538216560509554, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.444124937057495, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.722251832485199, + "num_tokens": 597932638.0, + "step": 23113 + }, + { + "epoch": 2.538326378212168, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5601794719696045, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7289931774139404, + "num_tokens": 597954380.0, + "step": 23114 + }, + { + "epoch": 2.538436195914781, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.484219789505005, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7106008529663086, + "num_tokens": 597980225.0, + "step": 23115 + }, + { + "epoch": 2.538546013617395, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 2.648224115371704, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7195818424224854, + "num_tokens": 598001818.0, + "step": 23116 + }, + { + "epoch": 2.5386558313200087, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2570536136627197, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.722031831741333, + "num_tokens": 598031330.0, + "step": 23117 + }, + { + "epoch": 2.5387656490226225, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.680274724960327, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7249249815940857, + "num_tokens": 598052679.0, + "step": 23118 + }, + { + 
"epoch": 2.5388754667252362, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.662548065185547, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7274122834205627, + "num_tokens": 598073395.0, + "step": 23119 + }, + { + "epoch": 2.5389852844278495, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.454803466796875, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7163534760475159, + "num_tokens": 598099590.0, + "step": 23120 + }, + { + "epoch": 2.5390951021304633, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 3.020143985748291, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7421696186065674, + "num_tokens": 598117357.0, + "step": 23121 + }, + { + "epoch": 2.539204919833077, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.203681707382202, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7330226302146912, + "num_tokens": 598146407.0, + "step": 23122 + }, + { + "epoch": 2.539314737535691, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6666433811187744, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7112576961517334, + "num_tokens": 598168807.0, + "step": 23123 + }, + { + "epoch": 2.5394245552383046, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5410678386688232, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7188029885292053, + "num_tokens": 598193948.0, + "step": 23124 + }, + { + "epoch": 2.539534372940918, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3600540161132812, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7184113264083862, + "num_tokens": 598220707.0, + "step": 23125 + }, + { + "epoch": 2.5396441906435316, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3803465366363525, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7265204191207886, + "num_tokens": 598246641.0, + "step": 23126 + }, + { + "epoch": 2.5397540083461454, + 
"ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.350686550140381, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7189837098121643, + "num_tokens": 598275816.0, + "step": 23127 + }, + { + "epoch": 2.539863826048759, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3543128967285156, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7139185667037964, + "num_tokens": 598304609.0, + "step": 23128 + }, + { + "epoch": 2.539973643751373, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.253016471862793, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.7015502452850342, + "num_tokens": 598332952.0, + "step": 23129 + }, + { + "epoch": 2.5400834614539862, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.315990924835205, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6963551044464111, + "num_tokens": 598361065.0, + "step": 23130 + }, + { + "epoch": 2.5401932791566, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2257144451141357, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6921881437301636, + "num_tokens": 598394400.0, + "step": 23131 + }, + { + "epoch": 2.5403030968592137, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.381939649581909, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7219406366348267, + "num_tokens": 598421433.0, + "step": 23132 + }, + { + "epoch": 2.5404129145618275, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7252442836761475, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7153380513191223, + "num_tokens": 598441169.0, + "step": 23133 + }, + { + "epoch": 2.5405227322644413, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.653949737548828, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.7167071104049683, + "num_tokens": 598463886.0, + "step": 23134 + }, + { + "epoch": 2.5406325499670546, + "ewc_loss": 
2.2292137145996094e-05, + "grad_norm": 2.773545742034912, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7074537873268127, + "num_tokens": 598485379.0, + "step": 23135 + }, + { + "epoch": 2.5407423676696683, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.55698561668396, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7143616080284119, + "num_tokens": 598511676.0, + "step": 23136 + }, + { + "epoch": 2.540852185372282, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.751274824142456, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7014517784118652, + "num_tokens": 598533470.0, + "step": 23137 + }, + { + "epoch": 2.540962003074896, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.470830202102661, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7040226459503174, + "num_tokens": 598560612.0, + "step": 23138 + }, + { + "epoch": 2.5410718207775096, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5373034477233887, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7277327179908752, + "num_tokens": 598583473.0, + "step": 23139 + }, + { + "epoch": 2.541181638480123, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3690731525421143, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7136921882629395, + "num_tokens": 598611489.0, + "step": 23140 + }, + { + "epoch": 2.5412914561827367, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.605590581893921, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7149043679237366, + "num_tokens": 598633506.0, + "step": 23141 + }, + { + "epoch": 2.5414012738853504, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.660306453704834, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7264008522033691, + "num_tokens": 598655695.0, + "step": 23142 + }, + { + "epoch": 2.5415110915879637, + "ewc_loss": 2.2292137145996094e-05, + 
"grad_norm": 2.4282140731811523, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.730663001537323, + "num_tokens": 598678323.0, + "step": 23143 + }, + { + "epoch": 2.5416209092905775, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.401440382003784, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7247234582901001, + "num_tokens": 598702544.0, + "step": 23144 + }, + { + "epoch": 2.5417307269931912, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.499268054962158, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6960452198982239, + "num_tokens": 598728721.0, + "step": 23145 + }, + { + "epoch": 2.541840544695805, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7584424018859863, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7346993684768677, + "num_tokens": 598748373.0, + "step": 23146 + }, + { + "epoch": 2.5419503623984188, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5681726932525635, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7153126001358032, + "num_tokens": 598772238.0, + "step": 23147 + }, + { + "epoch": 2.542060180101032, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4210503101348877, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.71772301197052, + "num_tokens": 598798213.0, + "step": 23148 + }, + { + "epoch": 2.542169997803646, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.351698160171509, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6925866603851318, + "num_tokens": 598825979.0, + "step": 23149 + }, + { + "epoch": 2.5422798155062596, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.625838041305542, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.72579026222229, + "num_tokens": 598848689.0, + "step": 23150 + }, + { + "epoch": 2.5423896332088733, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2046539783477783, + 
"learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7162043452262878, + "num_tokens": 598879821.0, + "step": 23151 + }, + { + "epoch": 2.542499450911487, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2157304286956787, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.698915421962738, + "num_tokens": 598912920.0, + "step": 23152 + }, + { + "epoch": 2.5426092686141004, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2476255893707275, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7180348634719849, + "num_tokens": 598941236.0, + "step": 23153 + }, + { + "epoch": 2.542719086316714, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.0956015586853027, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7234835624694824, + "num_tokens": 598974363.0, + "step": 23154 + }, + { + "epoch": 2.542828904019328, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4851787090301514, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7060257792472839, + "num_tokens": 598998809.0, + "step": 23155 + }, + { + "epoch": 2.5429387217219417, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.432190179824829, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7118990421295166, + "num_tokens": 599030383.0, + "step": 23156 + }, + { + "epoch": 2.5430485394245554, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.255872964859009, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7107482552528381, + "num_tokens": 599061889.0, + "step": 23157 + }, + { + "epoch": 2.5431583571271688, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4733502864837646, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7006775140762329, + "num_tokens": 599087343.0, + "step": 23158 + }, + { + "epoch": 2.5432681748297825, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 6.956765174865723, + "learning_rate": 1e-06, + 
"loss": 0.9956, + "mean_token_accuracy": 0.7046515941619873, + "num_tokens": 599116440.0, + "step": 23159 + }, + { + "epoch": 2.5433779925323963, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.805657386779785, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7324297428131104, + "num_tokens": 599137004.0, + "step": 23160 + }, + { + "epoch": 2.54348781023501, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.435476303100586, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7053077220916748, + "num_tokens": 599164010.0, + "step": 23161 + }, + { + "epoch": 2.543597627937624, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.8982467651367188, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7323974370956421, + "num_tokens": 599186751.0, + "step": 23162 + }, + { + "epoch": 2.543707445640237, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4072937965393066, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6985901594161987, + "num_tokens": 599216747.0, + "step": 23163 + }, + { + "epoch": 2.543817263342851, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3903968334198, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7177586555480957, + "num_tokens": 599242956.0, + "step": 23164 + }, + { + "epoch": 2.5439270810454646, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4890024662017822, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.717221736907959, + "num_tokens": 599268162.0, + "step": 23165 + }, + { + "epoch": 2.544036898748078, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5493006706237793, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7175460457801819, + "num_tokens": 599292277.0, + "step": 23166 + }, + { + "epoch": 2.544146716450692, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7560031414031982, + "learning_rate": 1e-06, + "loss": 0.9414, + 
"mean_token_accuracy": 0.7274818420410156, + "num_tokens": 599314496.0, + "step": 23167 + }, + { + "epoch": 2.5442565341533054, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3791697025299072, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6960387825965881, + "num_tokens": 599341428.0, + "step": 23168 + }, + { + "epoch": 2.544366351855919, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7228941917419434, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7423794865608215, + "num_tokens": 599362376.0, + "step": 23169 + }, + { + "epoch": 2.544476169558533, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7171263694763184, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7373278141021729, + "num_tokens": 599382189.0, + "step": 23170 + }, + { + "epoch": 2.5445859872611463, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.693455219268799, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7230398058891296, + "num_tokens": 599404613.0, + "step": 23171 + }, + { + "epoch": 2.54469580496376, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 3.0151309967041016, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7180769443511963, + "num_tokens": 599422510.0, + "step": 23172 + }, + { + "epoch": 2.544805622666374, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6330604553222656, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7087135314941406, + "num_tokens": 599448293.0, + "step": 23173 + }, + { + "epoch": 2.5449154403689875, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2996737957000732, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6962000131607056, + "num_tokens": 599478848.0, + "step": 23174 + }, + { + "epoch": 2.5450252580716013, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.505013942718506, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 
0.7312372922897339, + "num_tokens": 599502001.0, + "step": 23175 + }, + { + "epoch": 2.5451350757742146, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3642523288726807, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7141643762588501, + "num_tokens": 599528639.0, + "step": 23176 + }, + { + "epoch": 2.5452448934768284, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4421756267547607, + "learning_rate": 1e-06, + "loss": 1.0919, + "mean_token_accuracy": 0.6830167174339294, + "num_tokens": 599556815.0, + "step": 23177 + }, + { + "epoch": 2.545354711179442, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5514230728149414, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7351635694503784, + "num_tokens": 599580546.0, + "step": 23178 + }, + { + "epoch": 2.545464528882056, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3145275115966797, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7041320204734802, + "num_tokens": 599610573.0, + "step": 23179 + }, + { + "epoch": 2.5455743465846696, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.660658597946167, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.7033933997154236, + "num_tokens": 599632809.0, + "step": 23180 + }, + { + "epoch": 2.545684164287283, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.374603033065796, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6962872743606567, + "num_tokens": 599660754.0, + "step": 23181 + }, + { + "epoch": 2.5457939819898967, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.161104440689087, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7077692151069641, + "num_tokens": 599692272.0, + "step": 23182 + }, + { + "epoch": 2.5459037996925105, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2598016262054443, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7012468576431274, + 
"num_tokens": 599722418.0, + "step": 23183 + }, + { + "epoch": 2.546013617395124, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3072211742401123, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7183398604393005, + "num_tokens": 599750615.0, + "step": 23184 + }, + { + "epoch": 2.546123435097738, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.332829475402832, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7251997590065002, + "num_tokens": 599779424.0, + "step": 23185 + }, + { + "epoch": 2.5462332528003513, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5624263286590576, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7241344451904297, + "num_tokens": 599804042.0, + "step": 23186 + }, + { + "epoch": 2.546343070502965, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5266716480255127, + "learning_rate": 1e-06, + "loss": 1.1163, + "mean_token_accuracy": 0.676332950592041, + "num_tokens": 599831383.0, + "step": 23187 + }, + { + "epoch": 2.546452888205579, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5574848651885986, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7272706031799316, + "num_tokens": 599855286.0, + "step": 23188 + }, + { + "epoch": 2.5465627059081926, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6072561740875244, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7238689661026001, + "num_tokens": 599878271.0, + "step": 23189 + }, + { + "epoch": 2.5466725236108063, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.35070538520813, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7158982753753662, + "num_tokens": 599905729.0, + "step": 23190 + }, + { + "epoch": 2.5467823413134196, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6543240547180176, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7399399280548096, + "num_tokens": 599928077.0, + 
"step": 23191 + }, + { + "epoch": 2.5468921590160334, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6888957023620605, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7295113801956177, + "num_tokens": 599951375.0, + "step": 23192 + }, + { + "epoch": 2.547001976718647, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.395427703857422, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7127010226249695, + "num_tokens": 599976493.0, + "step": 23193 + }, + { + "epoch": 2.5471117944212605, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6041903495788574, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7224280834197998, + "num_tokens": 600000689.0, + "step": 23194 + }, + { + "epoch": 2.547221612123874, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.288484573364258, + "learning_rate": 1e-06, + "loss": 1.1088, + "mean_token_accuracy": 0.679862916469574, + "num_tokens": 600031697.0, + "step": 23195 + }, + { + "epoch": 2.547331429826488, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.313944101333618, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7347836494445801, + "num_tokens": 600056997.0, + "step": 23196 + }, + { + "epoch": 2.5474412475291017, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.556274175643921, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.728062093257904, + "num_tokens": 600081186.0, + "step": 23197 + }, + { + "epoch": 2.5475510652317155, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4611475467681885, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6990891695022583, + "num_tokens": 600106506.0, + "step": 23198 + }, + { + "epoch": 2.547660882934329, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.293238639831543, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6961522102355957, + "num_tokens": 600137370.0, + "step": 23199 + }, + { + "epoch": 
2.5477707006369426, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.353083372116089, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7239639759063721, + "num_tokens": 600164594.0, + "step": 23200 + }, + { + "epoch": 2.5478805183395563, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4405081272125244, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7119351625442505, + "num_tokens": 600189231.0, + "step": 23201 + }, + { + "epoch": 2.54799033604217, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.432185649871826, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7182822227478027, + "num_tokens": 600214379.0, + "step": 23202 + }, + { + "epoch": 2.548100153744784, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.321229934692383, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7320019006729126, + "num_tokens": 600241873.0, + "step": 23203 + }, + { + "epoch": 2.548209971447397, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5226166248321533, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7224091291427612, + "num_tokens": 600265780.0, + "step": 23204 + }, + { + "epoch": 2.548319789150011, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.289090156555176, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7202212810516357, + "num_tokens": 600294184.0, + "step": 23205 + }, + { + "epoch": 2.5484296068526247, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3535234928131104, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7299105525016785, + "num_tokens": 600322499.0, + "step": 23206 + }, + { + "epoch": 2.5485394245552384, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3293354511260986, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7088711261749268, + "num_tokens": 600351212.0, + "step": 23207 + }, + { + "epoch": 2.548649242257852, + "ewc_loss": 
2.2292137145996094e-05, + "grad_norm": 2.6096463203430176, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7041980028152466, + "num_tokens": 600376114.0, + "step": 23208 + }, + { + "epoch": 2.5487590599604655, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.693971872329712, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7136102914810181, + "num_tokens": 600397781.0, + "step": 23209 + }, + { + "epoch": 2.5488688776630792, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3019039630889893, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7360890507698059, + "num_tokens": 600425370.0, + "step": 23210 + }, + { + "epoch": 2.548978695365693, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4783072471618652, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.697494626045227, + "num_tokens": 600451785.0, + "step": 23211 + }, + { + "epoch": 2.5490885130683067, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2747373580932617, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7167499661445618, + "num_tokens": 600485089.0, + "step": 23212 + }, + { + "epoch": 2.5491983307709205, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7082247734069824, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7292808294296265, + "num_tokens": 600505764.0, + "step": 23213 + }, + { + "epoch": 2.549308148473534, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5201780796051025, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7218096256256104, + "num_tokens": 600529607.0, + "step": 23214 + }, + { + "epoch": 2.5494179661761476, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7669944763183594, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7439504861831665, + "num_tokens": 600549122.0, + "step": 23215 + }, + { + "epoch": 2.5495277838787613, + "ewc_loss": 2.2292137145996094e-05, + 
"grad_norm": 2.2436060905456543, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7183496356010437, + "num_tokens": 600578980.0, + "step": 23216 + }, + { + "epoch": 2.5496376015813746, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.536489963531494, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.748058021068573, + "num_tokens": 600602436.0, + "step": 23217 + }, + { + "epoch": 2.549747419283989, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2093818187713623, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7294570803642273, + "num_tokens": 600631297.0, + "step": 23218 + }, + { + "epoch": 2.549857236986602, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.509537935256958, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7131456732749939, + "num_tokens": 600656923.0, + "step": 23219 + }, + { + "epoch": 2.549967054689216, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3616342544555664, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7124943733215332, + "num_tokens": 600683939.0, + "step": 23220 + }, + { + "epoch": 2.5500768723918297, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.290240526199341, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7103176712989807, + "num_tokens": 600712750.0, + "step": 23221 + }, + { + "epoch": 2.550186690094443, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.529602289199829, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7031550407409668, + "num_tokens": 600736385.0, + "step": 23222 + }, + { + "epoch": 2.5502965077970567, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7431089878082275, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7310653924942017, + "num_tokens": 600756943.0, + "step": 23223 + }, + { + "epoch": 2.5504063254996705, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3767096996307373, 
+ "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7095805406570435, + "num_tokens": 600784891.0, + "step": 23224 + }, + { + "epoch": 2.5505161432022843, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1518359184265137, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7096275687217712, + "num_tokens": 600815435.0, + "step": 23225 + }, + { + "epoch": 2.550625960904898, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3973894119262695, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6924656629562378, + "num_tokens": 600842585.0, + "step": 23226 + }, + { + "epoch": 2.5507357786075113, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3193557262420654, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7372337579727173, + "num_tokens": 600870637.0, + "step": 23227 + }, + { + "epoch": 2.550845596310125, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5406315326690674, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7198747396469116, + "num_tokens": 600894183.0, + "step": 23228 + }, + { + "epoch": 2.550955414012739, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4476351737976074, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.72083580493927, + "num_tokens": 600917597.0, + "step": 23229 + }, + { + "epoch": 2.5510652317153526, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.087393045425415, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7102667093276978, + "num_tokens": 600950127.0, + "step": 23230 + }, + { + "epoch": 2.5511750494179664, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.355536937713623, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7157274484634399, + "num_tokens": 600976942.0, + "step": 23231 + }, + { + "epoch": 2.5512848671205797, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.525991201400757, + "learning_rate": 1e-06, + 
"loss": 0.988, + "mean_token_accuracy": 0.7163522839546204, + "num_tokens": 601000981.0, + "step": 23232 + }, + { + "epoch": 2.5513946848231934, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.430443525314331, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7293359041213989, + "num_tokens": 601027115.0, + "step": 23233 + }, + { + "epoch": 2.551504502525807, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.50958514213562, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7446810007095337, + "num_tokens": 601050975.0, + "step": 23234 + }, + { + "epoch": 2.551614320228421, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.333544969558716, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7246785163879395, + "num_tokens": 601079541.0, + "step": 23235 + }, + { + "epoch": 2.5517241379310347, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.353290557861328, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7035098075866699, + "num_tokens": 601109464.0, + "step": 23236 + }, + { + "epoch": 2.551833955633648, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.255051374435425, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7329317331314087, + "num_tokens": 601137656.0, + "step": 23237 + }, + { + "epoch": 2.5519437733362618, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.535872220993042, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7206475734710693, + "num_tokens": 601160857.0, + "step": 23238 + }, + { + "epoch": 2.5520535910388755, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3527700901031494, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7298586368560791, + "num_tokens": 601187777.0, + "step": 23239 + }, + { + "epoch": 2.5521634087414893, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4787216186523438, + "learning_rate": 1e-06, + "loss": 0.8925, + 
"mean_token_accuracy": 0.7323434352874756, + "num_tokens": 601211846.0, + "step": 23240 + }, + { + "epoch": 2.552273226444103, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.420391321182251, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7077556848526001, + "num_tokens": 601236545.0, + "step": 23241 + }, + { + "epoch": 2.5523830441467164, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2778031826019287, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7175067067146301, + "num_tokens": 601264423.0, + "step": 23242 + }, + { + "epoch": 2.55249286184933, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3938982486724854, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7076231837272644, + "num_tokens": 601290790.0, + "step": 23243 + }, + { + "epoch": 2.552602679551944, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.636566400527954, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7232741117477417, + "num_tokens": 601311762.0, + "step": 23244 + }, + { + "epoch": 2.552712497254557, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3899714946746826, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7175818681716919, + "num_tokens": 601338352.0, + "step": 23245 + }, + { + "epoch": 2.552822314957171, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5977492332458496, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7355160713195801, + "num_tokens": 601360971.0, + "step": 23246 + }, + { + "epoch": 2.5529321326597847, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.576932191848755, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7222316265106201, + "num_tokens": 601382171.0, + "step": 23247 + }, + { + "epoch": 2.5530419503623984, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.574401378631592, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 
0.7210127711296082, + "num_tokens": 601405621.0, + "step": 23248 + }, + { + "epoch": 2.553151768065012, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3506405353546143, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7106025218963623, + "num_tokens": 601433299.0, + "step": 23249 + }, + { + "epoch": 2.5532615857676255, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2759487628936768, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7211072444915771, + "num_tokens": 601461285.0, + "step": 23250 + }, + { + "epoch": 2.5533714034702393, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2718615531921387, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6952428817749023, + "num_tokens": 601491882.0, + "step": 23251 + }, + { + "epoch": 2.553481221172853, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.701338052749634, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7161627411842346, + "num_tokens": 601513290.0, + "step": 23252 + }, + { + "epoch": 2.553591038875467, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4438438415527344, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6909278631210327, + "num_tokens": 601538138.0, + "step": 23253 + }, + { + "epoch": 2.5537008565780805, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.334313154220581, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6956783533096313, + "num_tokens": 601568430.0, + "step": 23254 + }, + { + "epoch": 2.553810674280694, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.409099817276001, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.716937780380249, + "num_tokens": 601594573.0, + "step": 23255 + }, + { + "epoch": 2.5539204919833076, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.290800094604492, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7282112240791321, + 
"num_tokens": 601622798.0, + "step": 23256 + }, + { + "epoch": 2.5540303096859214, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6417653560638428, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7103095054626465, + "num_tokens": 601645628.0, + "step": 23257 + }, + { + "epoch": 2.554140127388535, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.777196168899536, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7070258259773254, + "num_tokens": 601666359.0, + "step": 23258 + }, + { + "epoch": 2.554249945091149, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6720612049102783, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7095131278038025, + "num_tokens": 601687415.0, + "step": 23259 + }, + { + "epoch": 2.554359762793762, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.468109130859375, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6905893683433533, + "num_tokens": 601712944.0, + "step": 23260 + }, + { + "epoch": 2.554469580496376, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4971160888671875, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7101765871047974, + "num_tokens": 601740554.0, + "step": 23261 + }, + { + "epoch": 2.5545793981989897, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.702969551086426, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7119632363319397, + "num_tokens": 601762863.0, + "step": 23262 + }, + { + "epoch": 2.5546892159016035, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4271275997161865, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7095744013786316, + "num_tokens": 601790171.0, + "step": 23263 + }, + { + "epoch": 2.5547990336042172, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2548274993896484, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7364662289619446, + "num_tokens": 601817728.0, + 
"step": 23264 + }, + { + "epoch": 2.5549088513068305, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 3.9417965412139893, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7195112109184265, + "num_tokens": 601840399.0, + "step": 23265 + }, + { + "epoch": 2.5550186690094443, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4494168758392334, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7303377389907837, + "num_tokens": 601864120.0, + "step": 23266 + }, + { + "epoch": 2.555128486712058, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5986690521240234, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7121754884719849, + "num_tokens": 601887840.0, + "step": 23267 + }, + { + "epoch": 2.5552383044146714, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.651085615158081, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.7006675601005554, + "num_tokens": 601911940.0, + "step": 23268 + }, + { + "epoch": 2.5553481221172856, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7560312747955322, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7098251581192017, + "num_tokens": 601932390.0, + "step": 23269 + }, + { + "epoch": 2.555457939819899, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2653839588165283, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7133506536483765, + "num_tokens": 601961872.0, + "step": 23270 + }, + { + "epoch": 2.5555677575225126, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3415560722351074, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7050079107284546, + "num_tokens": 601990854.0, + "step": 23271 + }, + { + "epoch": 2.5556775752251264, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7933809757232666, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7228704690933228, + "num_tokens": 602010988.0, + "step": 23272 + }, + { + 
"epoch": 2.5557873929277397, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.42914080619812, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7250516414642334, + "num_tokens": 602035175.0, + "step": 23273 + }, + { + "epoch": 2.5558972106303535, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.546114206314087, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7049884796142578, + "num_tokens": 602057633.0, + "step": 23274 + }, + { + "epoch": 2.5560070283329672, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5026700496673584, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7254889011383057, + "num_tokens": 602082040.0, + "step": 23275 + }, + { + "epoch": 2.556116846035581, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.639779567718506, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.710602879524231, + "num_tokens": 602104266.0, + "step": 23276 + }, + { + "epoch": 2.5562266637381947, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.302734613418579, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6986842751502991, + "num_tokens": 602131165.0, + "step": 23277 + }, + { + "epoch": 2.556336481440808, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.379404306411743, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7454968690872192, + "num_tokens": 602155657.0, + "step": 23278 + }, + { + "epoch": 2.556446299143422, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.36352801322937, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7162891626358032, + "num_tokens": 602181965.0, + "step": 23279 + }, + { + "epoch": 2.5565561168460356, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.25970196723938, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7048646211624146, + "num_tokens": 602211976.0, + "step": 23280 + }, + { + "epoch": 2.5566659345486493, + 
"ewc_loss": 2.2292137145996094e-05, + "grad_norm": 3.735936403274536, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7383847236633301, + "num_tokens": 602239196.0, + "step": 23281 + }, + { + "epoch": 2.556775752251263, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2272837162017822, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.720539927482605, + "num_tokens": 602267444.0, + "step": 23282 + }, + { + "epoch": 2.5568855699538764, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.182676315307617, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.713172197341919, + "num_tokens": 602298803.0, + "step": 23283 + }, + { + "epoch": 2.55699538765649, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.291264533996582, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7177867293357849, + "num_tokens": 602328423.0, + "step": 23284 + }, + { + "epoch": 2.557105205359104, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.689547061920166, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7054717540740967, + "num_tokens": 602350943.0, + "step": 23285 + }, + { + "epoch": 2.5572150230617177, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3596763610839844, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7191699743270874, + "num_tokens": 602378886.0, + "step": 23286 + }, + { + "epoch": 2.5573248407643314, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5871825218200684, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7402555346488953, + "num_tokens": 602400804.0, + "step": 23287 + }, + { + "epoch": 2.5574346584669447, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.509319543838501, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7064350843429565, + "num_tokens": 602426240.0, + "step": 23288 + }, + { + "epoch": 2.5575444761695585, + "ewc_loss": 
2.2292137145996094e-05, + "grad_norm": 2.917898654937744, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7169225215911865, + "num_tokens": 602447651.0, + "step": 23289 + }, + { + "epoch": 2.5576542938721722, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.868469715118408, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7108020782470703, + "num_tokens": 602469324.0, + "step": 23290 + }, + { + "epoch": 2.557764111574786, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4666683673858643, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.737114667892456, + "num_tokens": 602493436.0, + "step": 23291 + }, + { + "epoch": 2.5578739292773998, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2923521995544434, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7034306526184082, + "num_tokens": 602524451.0, + "step": 23292 + }, + { + "epoch": 2.557983746980013, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3465890884399414, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7211573123931885, + "num_tokens": 602552139.0, + "step": 23293 + }, + { + "epoch": 2.558093564682627, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.178212881088257, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7187986969947815, + "num_tokens": 602584609.0, + "step": 23294 + }, + { + "epoch": 2.5582033823852406, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.247870683670044, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6926161646842957, + "num_tokens": 602613567.0, + "step": 23295 + }, + { + "epoch": 2.558313200087854, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 3.0384581089019775, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7288451790809631, + "num_tokens": 602630565.0, + "step": 23296 + }, + { + "epoch": 2.5584230177904677, + "ewc_loss": 2.2292137145996094e-05, + 
"grad_norm": 2.4795279502868652, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7029698491096497, + "num_tokens": 602655516.0, + "step": 23297 + }, + { + "epoch": 2.5585328354930814, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3342511653900146, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7191538214683533, + "num_tokens": 602681620.0, + "step": 23298 + }, + { + "epoch": 2.558642653195695, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.340933322906494, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7304535508155823, + "num_tokens": 602707308.0, + "step": 23299 + }, + { + "epoch": 2.558752470898309, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.390434741973877, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7187587022781372, + "num_tokens": 602731336.0, + "step": 23300 + }, + { + "epoch": 2.5588622886009222, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.824394702911377, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7386600375175476, + "num_tokens": 602751478.0, + "step": 23301 + }, + { + "epoch": 2.558972106303536, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4934935569763184, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6974223852157593, + "num_tokens": 602776797.0, + "step": 23302 + }, + { + "epoch": 2.5590819240061498, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 32.095458984375, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7280828952789307, + "num_tokens": 602799985.0, + "step": 23303 + }, + { + "epoch": 2.5591917417087635, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.637866973876953, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7117421627044678, + "num_tokens": 602825496.0, + "step": 23304 + }, + { + "epoch": 2.5593015594113773, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5696656703948975, + 
"learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7039842009544373, + "num_tokens": 602849205.0, + "step": 23305 + }, + { + "epoch": 2.5594113771139906, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.257197856903076, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6991496086120605, + "num_tokens": 602879621.0, + "step": 23306 + }, + { + "epoch": 2.5595211948166043, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2347142696380615, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7241304516792297, + "num_tokens": 602908426.0, + "step": 23307 + }, + { + "epoch": 2.559631012519218, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3216261863708496, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7250333428382874, + "num_tokens": 602934702.0, + "step": 23308 + }, + { + "epoch": 2.559740830221832, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5675570964813232, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7155985832214355, + "num_tokens": 602956640.0, + "step": 23309 + }, + { + "epoch": 2.5598506479244456, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.419786214828491, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7298804521560669, + "num_tokens": 602980767.0, + "step": 23310 + }, + { + "epoch": 2.559960465627059, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6185929775238037, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7184504270553589, + "num_tokens": 603004907.0, + "step": 23311 + }, + { + "epoch": 2.5600702833296727, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3591058254241943, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7233775854110718, + "num_tokens": 603031709.0, + "step": 23312 + }, + { + "epoch": 2.5601801010322864, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.288764238357544, + "learning_rate": 1e-06, + 
"loss": 1.0194, + "mean_token_accuracy": 0.6986140012741089, + "num_tokens": 603061858.0, + "step": 23313 + }, + { + "epoch": 2.5602899187349, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5303704738616943, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.723106861114502, + "num_tokens": 603085881.0, + "step": 23314 + }, + { + "epoch": 2.560399736437514, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.953507900238037, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7219339609146118, + "num_tokens": 603103768.0, + "step": 23315 + }, + { + "epoch": 2.5605095541401273, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.307814121246338, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7167785167694092, + "num_tokens": 603132650.0, + "step": 23316 + }, + { + "epoch": 2.560619371842741, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6026530265808105, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7220083475112915, + "num_tokens": 603157782.0, + "step": 23317 + }, + { + "epoch": 2.560729189545355, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.891141891479492, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7078700065612793, + "num_tokens": 603177854.0, + "step": 23318 + }, + { + "epoch": 2.5608390072479685, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.772806167602539, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7317100763320923, + "num_tokens": 603199065.0, + "step": 23319 + }, + { + "epoch": 2.5609488249505823, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.443997859954834, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7168610095977783, + "num_tokens": 603225984.0, + "step": 23320 + }, + { + "epoch": 2.5610586426531956, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3974132537841797, + "learning_rate": 1e-06, + "loss": 0.9677, + 
"mean_token_accuracy": 0.7192987203598022, + "num_tokens": 603253866.0, + "step": 23321 + }, + { + "epoch": 2.5611684603558094, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5902161598205566, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7289976477622986, + "num_tokens": 603276823.0, + "step": 23322 + }, + { + "epoch": 2.561278278058423, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.577808380126953, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7360504269599915, + "num_tokens": 603298508.0, + "step": 23323 + }, + { + "epoch": 2.5613880957610364, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3229219913482666, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7047511339187622, + "num_tokens": 603329742.0, + "step": 23324 + }, + { + "epoch": 2.56149791346365, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4125986099243164, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6917288303375244, + "num_tokens": 603358198.0, + "step": 23325 + }, + { + "epoch": 2.561607731166264, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.473740577697754, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7205844521522522, + "num_tokens": 603383642.0, + "step": 23326 + }, + { + "epoch": 2.5617175488688777, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.400192975997925, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7317664623260498, + "num_tokens": 603409275.0, + "step": 23327 + }, + { + "epoch": 2.5618273665714915, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.210594654083252, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6985065937042236, + "num_tokens": 603440126.0, + "step": 23328 + }, + { + "epoch": 2.5619371842741048, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.781554937362671, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 
0.721253514289856, + "num_tokens": 603459754.0, + "step": 23329 + }, + { + "epoch": 2.5620470019767185, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1859445571899414, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6972939968109131, + "num_tokens": 603491305.0, + "step": 23330 + }, + { + "epoch": 2.5621568196793323, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3698573112487793, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.725092351436615, + "num_tokens": 603516967.0, + "step": 23331 + }, + { + "epoch": 2.562266637381946, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3352723121643066, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7217998504638672, + "num_tokens": 603543598.0, + "step": 23332 + }, + { + "epoch": 2.56237645508456, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2843844890594482, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7152793407440186, + "num_tokens": 603572869.0, + "step": 23333 + }, + { + "epoch": 2.562486272787173, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7522623538970947, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7063892483711243, + "num_tokens": 603595639.0, + "step": 23334 + }, + { + "epoch": 2.562596090489787, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6654465198516846, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6972452402114868, + "num_tokens": 603619042.0, + "step": 23335 + }, + { + "epoch": 2.5627059081924006, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2007813453674316, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6999638080596924, + "num_tokens": 603654904.0, + "step": 23336 + }, + { + "epoch": 2.5628157258950144, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.44498610496521, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7136408686637878, + 
"num_tokens": 603679865.0, + "step": 23337 + }, + { + "epoch": 2.562925543597628, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.482626438140869, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7148100137710571, + "num_tokens": 603705678.0, + "step": 23338 + }, + { + "epoch": 2.5630353613002415, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.612071990966797, + "learning_rate": 1e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6948870420455933, + "num_tokens": 603729771.0, + "step": 23339 + }, + { + "epoch": 2.563145179002855, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4295012950897217, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7046347856521606, + "num_tokens": 603755805.0, + "step": 23340 + }, + { + "epoch": 2.563254996705469, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3708789348602295, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.7019972801208496, + "num_tokens": 603784069.0, + "step": 23341 + }, + { + "epoch": 2.5633648144080827, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4173803329467773, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7241482734680176, + "num_tokens": 603809966.0, + "step": 23342 + }, + { + "epoch": 2.5634746321106965, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5855445861816406, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6963630318641663, + "num_tokens": 603833997.0, + "step": 23343 + }, + { + "epoch": 2.56358444981331, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3333396911621094, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6979554295539856, + "num_tokens": 603862599.0, + "step": 23344 + }, + { + "epoch": 2.5636942675159236, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4560189247131348, + "learning_rate": 1e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.6958261132240295, + "num_tokens": 603889002.0, + 
"step": 23345 + }, + { + "epoch": 2.5638040852185373, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4351298809051514, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.7025283575057983, + "num_tokens": 603915766.0, + "step": 23346 + }, + { + "epoch": 2.5639139029211506, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1783275604248047, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6910910606384277, + "num_tokens": 603949935.0, + "step": 23347 + }, + { + "epoch": 2.564023720623765, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.424386501312256, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7343074083328247, + "num_tokens": 603976908.0, + "step": 23348 + }, + { + "epoch": 2.564133538326378, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5164191722869873, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7079339623451233, + "num_tokens": 604002638.0, + "step": 23349 + }, + { + "epoch": 2.564243356028992, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2781543731689453, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6978486776351929, + "num_tokens": 604032425.0, + "step": 23350 + }, + { + "epoch": 2.5643531737316057, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.245081901550293, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7133021950721741, + "num_tokens": 604062291.0, + "step": 23351 + }, + { + "epoch": 2.564462991434219, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5859758853912354, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7106577157974243, + "num_tokens": 604085436.0, + "step": 23352 + }, + { + "epoch": 2.5645728091368327, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.655351161956787, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7038755416870117, + "num_tokens": 604108637.0, + "step": 23353 + }, + { + 
"epoch": 2.5646826268394465, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5766146183013916, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.7067131400108337, + "num_tokens": 604133226.0, + "step": 23354 + }, + { + "epoch": 2.5647924445420602, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4654808044433594, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7126592397689819, + "num_tokens": 604159178.0, + "step": 23355 + }, + { + "epoch": 2.564902262244674, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6224968433380127, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7228896617889404, + "num_tokens": 604181713.0, + "step": 23356 + }, + { + "epoch": 2.5650120799472873, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.246303081512451, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7097015380859375, + "num_tokens": 604211696.0, + "step": 23357 + }, + { + "epoch": 2.565121897649901, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.327643394470215, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.72395920753479, + "num_tokens": 604238132.0, + "step": 23358 + }, + { + "epoch": 2.565231715352515, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2916529178619385, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7168519496917725, + "num_tokens": 604267759.0, + "step": 23359 + }, + { + "epoch": 2.5653415330551286, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4368367195129395, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7110797166824341, + "num_tokens": 604295286.0, + "step": 23360 + }, + { + "epoch": 2.5654513507577423, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5810859203338623, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7205449342727661, + "num_tokens": 604318751.0, + "step": 23361 + }, + { + "epoch": 2.5655611684603556, + 
"ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.301362991333008, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7399437427520752, + "num_tokens": 604343713.0, + "step": 23362 + }, + { + "epoch": 2.5656709861629694, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1270389556884766, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.7007501721382141, + "num_tokens": 604376454.0, + "step": 23363 + }, + { + "epoch": 2.565780803865583, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7199149131774902, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.72807776927948, + "num_tokens": 604397121.0, + "step": 23364 + }, + { + "epoch": 2.565890621568197, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6401326656341553, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.745487630367279, + "num_tokens": 604417671.0, + "step": 23365 + }, + { + "epoch": 2.5660004392708107, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.463911533355713, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7065079808235168, + "num_tokens": 604441561.0, + "step": 23366 + }, + { + "epoch": 2.566110256973424, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4097697734832764, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7106373906135559, + "num_tokens": 604467657.0, + "step": 23367 + }, + { + "epoch": 2.5662200746760377, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.317901134490967, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.7004752159118652, + "num_tokens": 604497964.0, + "step": 23368 + }, + { + "epoch": 2.5663298923786515, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4442310333251953, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7099995017051697, + "num_tokens": 604523342.0, + "step": 23369 + }, + { + "epoch": 2.5664397100812653, + "ewc_loss": 
2.2292137145996094e-05, + "grad_norm": 2.4974122047424316, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7063318490982056, + "num_tokens": 604548761.0, + "step": 23370 + }, + { + "epoch": 2.566549527783879, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.513751745223999, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6977505087852478, + "num_tokens": 604574531.0, + "step": 23371 + }, + { + "epoch": 2.5666593454864923, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2189955711364746, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6974232792854309, + "num_tokens": 604605288.0, + "step": 23372 + }, + { + "epoch": 2.566769163189106, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7932255268096924, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7385165095329285, + "num_tokens": 604624112.0, + "step": 23373 + }, + { + "epoch": 2.56687898089172, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5165340900421143, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7253686785697937, + "num_tokens": 604647904.0, + "step": 23374 + }, + { + "epoch": 2.566988798594333, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4473516941070557, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7110247015953064, + "num_tokens": 604674123.0, + "step": 23375 + }, + { + "epoch": 2.567098616296947, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.557511568069458, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.728082537651062, + "num_tokens": 604698545.0, + "step": 23376 + }, + { + "epoch": 2.5672084339995607, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2084109783172607, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7131007313728333, + "num_tokens": 604728830.0, + "step": 23377 + }, + { + "epoch": 2.5673182517021744, + "ewc_loss": 2.2292137145996094e-05, + 
"grad_norm": 2.546196937561035, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7361582517623901, + "num_tokens": 604753387.0, + "step": 23378 + }, + { + "epoch": 2.567428069404788, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5184032917022705, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7332549095153809, + "num_tokens": 604778137.0, + "step": 23379 + }, + { + "epoch": 2.5675378871074015, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 1.94322669506073, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7192285060882568, + "num_tokens": 604814087.0, + "step": 23380 + }, + { + "epoch": 2.5676477048100153, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5798068046569824, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7413448095321655, + "num_tokens": 604836600.0, + "step": 23381 + }, + { + "epoch": 2.567757522512629, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7905101776123047, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7393288016319275, + "num_tokens": 604855732.0, + "step": 23382 + }, + { + "epoch": 2.5678673402152428, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2856850624084473, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7376048564910889, + "num_tokens": 604883780.0, + "step": 23383 + }, + { + "epoch": 2.5679771579178565, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.494473695755005, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7344581484794617, + "num_tokens": 604906772.0, + "step": 23384 + }, + { + "epoch": 2.56808697562047, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.356670618057251, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6838657855987549, + "num_tokens": 604937195.0, + "step": 23385 + }, + { + "epoch": 2.5681967933230836, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.437326669692993, 
+ "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7099226713180542, + "num_tokens": 604964163.0, + "step": 23386 + }, + { + "epoch": 2.5683066110256974, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2995593547821045, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7019290924072266, + "num_tokens": 604992272.0, + "step": 23387 + }, + { + "epoch": 2.568416428728311, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.438892364501953, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7129887342453003, + "num_tokens": 605016750.0, + "step": 23388 + }, + { + "epoch": 2.568526246430925, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.434891939163208, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7370034456253052, + "num_tokens": 605041357.0, + "step": 23389 + }, + { + "epoch": 2.568636064133538, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.289923667907715, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7300050258636475, + "num_tokens": 605067758.0, + "step": 23390 + }, + { + "epoch": 2.568745881836152, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4845077991485596, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.720525324344635, + "num_tokens": 605092976.0, + "step": 23391 + }, + { + "epoch": 2.5688556995387657, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.372673511505127, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7088596224784851, + "num_tokens": 605121236.0, + "step": 23392 + }, + { + "epoch": 2.5689655172413794, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.244375228881836, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.711252748966217, + "num_tokens": 605152048.0, + "step": 23393 + }, + { + "epoch": 2.569075334943993, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.724982738494873, + "learning_rate": 1e-06, + "loss": 
0.9126, + "mean_token_accuracy": 0.7316558361053467, + "num_tokens": 605172717.0, + "step": 23394 + }, + { + "epoch": 2.5691851526466065, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5449676513671875, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.735066294670105, + "num_tokens": 605193782.0, + "step": 23395 + }, + { + "epoch": 2.5692949703492203, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4802162647247314, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7139439582824707, + "num_tokens": 605218931.0, + "step": 23396 + }, + { + "epoch": 2.569404788051834, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.37630295753479, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7076767683029175, + "num_tokens": 605246891.0, + "step": 23397 + }, + { + "epoch": 2.5695146057544473, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.41142201423645, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7239255309104919, + "num_tokens": 605272368.0, + "step": 23398 + }, + { + "epoch": 2.5696244234570615, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7450804710388184, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7314523458480835, + "num_tokens": 605292360.0, + "step": 23399 + }, + { + "epoch": 2.569734241159675, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4537911415100098, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6978464126586914, + "num_tokens": 605322031.0, + "step": 23400 + }, + { + "epoch": 2.5698440588622886, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.572675943374634, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7189537286758423, + "num_tokens": 605347994.0, + "step": 23401 + }, + { + "epoch": 2.5699538765649024, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5800905227661133, + "learning_rate": 1e-06, + "loss": 1.0232, + 
"mean_token_accuracy": 0.710745096206665, + "num_tokens": 605372584.0, + "step": 23402 + }, + { + "epoch": 2.5700636942675157, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.436819553375244, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7212107181549072, + "num_tokens": 605398031.0, + "step": 23403 + }, + { + "epoch": 2.5701735119701294, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.8218984603881836, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7078490257263184, + "num_tokens": 605420220.0, + "step": 23404 + }, + { + "epoch": 2.570283329672743, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.405447006225586, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6972301602363586, + "num_tokens": 605447632.0, + "step": 23405 + }, + { + "epoch": 2.570393147375357, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6381237506866455, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7205531597137451, + "num_tokens": 605469430.0, + "step": 23406 + }, + { + "epoch": 2.5705029650779707, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2713279724121094, + "learning_rate": 1e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6920316219329834, + "num_tokens": 605499226.0, + "step": 23407 + }, + { + "epoch": 2.570612782780584, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3236050605773926, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7054435610771179, + "num_tokens": 605527397.0, + "step": 23408 + }, + { + "epoch": 2.570722600483198, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3340423107147217, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7014053463935852, + "num_tokens": 605553009.0, + "step": 23409 + }, + { + "epoch": 2.5708324181858115, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.8041090965270996, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 
0.7361186742782593, + "num_tokens": 605572554.0, + "step": 23410 + }, + { + "epoch": 2.5709422358884253, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.357621669769287, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.7088348269462585, + "num_tokens": 605600291.0, + "step": 23411 + }, + { + "epoch": 2.571052053591039, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.41904354095459, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.756534218788147, + "num_tokens": 605624462.0, + "step": 23412 + }, + { + "epoch": 2.5711618712936524, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2601237297058105, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.720710813999176, + "num_tokens": 605654859.0, + "step": 23413 + }, + { + "epoch": 2.571271688996266, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6390275955200195, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7325316071510315, + "num_tokens": 605677494.0, + "step": 23414 + }, + { + "epoch": 2.57138150669888, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4323861598968506, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7106466293334961, + "num_tokens": 605703942.0, + "step": 23415 + }, + { + "epoch": 2.5714913244014936, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.591853380203247, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.691847026348114, + "num_tokens": 605728956.0, + "step": 23416 + }, + { + "epoch": 2.5716011421041074, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.568066120147705, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7363556027412415, + "num_tokens": 605752376.0, + "step": 23417 + }, + { + "epoch": 2.5717109598067207, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.634511947631836, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7094552516937256, + "num_tokens": 
605776295.0, + "step": 23418 + }, + { + "epoch": 2.5718207775093345, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.507761001586914, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7394508123397827, + "num_tokens": 605798727.0, + "step": 23419 + }, + { + "epoch": 2.5719305952119482, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4926507472991943, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6919046640396118, + "num_tokens": 605830369.0, + "step": 23420 + }, + { + "epoch": 2.572040412914562, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.584064245223999, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7134143710136414, + "num_tokens": 605853381.0, + "step": 23421 + }, + { + "epoch": 2.5721502306171757, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.303208827972412, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7064394950866699, + "num_tokens": 605881397.0, + "step": 23422 + }, + { + "epoch": 2.572260048319789, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5148091316223145, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7112009525299072, + "num_tokens": 605905674.0, + "step": 23423 + }, + { + "epoch": 2.572369866022403, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.47676157951355, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7122772336006165, + "num_tokens": 605931293.0, + "step": 23424 + }, + { + "epoch": 2.5724796837250166, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.90299391746521, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7164342403411865, + "num_tokens": 605949840.0, + "step": 23425 + }, + { + "epoch": 2.57258950142763, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7219746112823486, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7395700216293335, + "num_tokens": 605969918.0, + "step": 23426 + }, + { 
+ "epoch": 2.5726993191302436, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.493375301361084, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6889001727104187, + "num_tokens": 605997400.0, + "step": 23427 + }, + { + "epoch": 2.5728091368328574, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.51857590675354, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6975772976875305, + "num_tokens": 606022978.0, + "step": 23428 + }, + { + "epoch": 2.572918954535471, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6708486080169678, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7457589507102966, + "num_tokens": 606043132.0, + "step": 23429 + }, + { + "epoch": 2.573028772238085, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4948196411132812, + "learning_rate": 1e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6952906250953674, + "num_tokens": 606070287.0, + "step": 23430 + }, + { + "epoch": 2.573138589940698, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.242258310317993, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7086305022239685, + "num_tokens": 606100701.0, + "step": 23431 + }, + { + "epoch": 2.573248407643312, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3648996353149414, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7184306979179382, + "num_tokens": 606128149.0, + "step": 23432 + }, + { + "epoch": 2.5733582253459257, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5513243675231934, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.704974889755249, + "num_tokens": 606150533.0, + "step": 23433 + }, + { + "epoch": 2.5734680430485395, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.731588363647461, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7514133453369141, + "num_tokens": 606169482.0, + "step": 23434 + }, + { + "epoch": 2.5735778607511532, + 
"ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.327277660369873, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7215673923492432, + "num_tokens": 606195832.0, + "step": 23435 + }, + { + "epoch": 2.5736876784537666, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3037683963775635, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7135903835296631, + "num_tokens": 606225471.0, + "step": 23436 + }, + { + "epoch": 2.5737974961563803, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4165685176849365, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6988829970359802, + "num_tokens": 606251218.0, + "step": 23437 + }, + { + "epoch": 2.573907313858994, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1269900798797607, + "learning_rate": 1e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.6936330795288086, + "num_tokens": 606287133.0, + "step": 23438 + }, + { + "epoch": 2.574017131561608, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1295950412750244, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6998008489608765, + "num_tokens": 606319512.0, + "step": 23439 + }, + { + "epoch": 2.5741269492642216, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.129404067993164, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7158399224281311, + "num_tokens": 606351132.0, + "step": 23440 + }, + { + "epoch": 2.574236766966835, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.232077121734619, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7483152151107788, + "num_tokens": 606377231.0, + "step": 23441 + }, + { + "epoch": 2.5743465846694487, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.482329845428467, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7090908288955688, + "num_tokens": 606401545.0, + "step": 23442 + }, + { + "epoch": 2.5744564023720624, + "ewc_loss": 
2.2292137145996094e-05, + "grad_norm": 2.384068012237549, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7278540134429932, + "num_tokens": 606426396.0, + "step": 23443 + }, + { + "epoch": 2.574566220074676, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3829269409179688, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7234442234039307, + "num_tokens": 606455352.0, + "step": 23444 + }, + { + "epoch": 2.57467603777729, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.557175874710083, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7239810228347778, + "num_tokens": 606478255.0, + "step": 23445 + }, + { + "epoch": 2.5747858554799032, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5371737480163574, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.7006474137306213, + "num_tokens": 606502966.0, + "step": 23446 + }, + { + "epoch": 2.574895673182517, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.8929009437561035, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7281954288482666, + "num_tokens": 606520752.0, + "step": 23447 + }, + { + "epoch": 2.5750054908851308, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.196742057800293, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7212405204772949, + "num_tokens": 606549079.0, + "step": 23448 + }, + { + "epoch": 2.575115308587744, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4990127086639404, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7200331091880798, + "num_tokens": 606571157.0, + "step": 23449 + }, + { + "epoch": 2.5752251262903583, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.429703712463379, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7061984539031982, + "num_tokens": 606596777.0, + "step": 23450 + }, + { + "epoch": 2.5753349439929716, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.6423635482788086, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7159832119941711, + "num_tokens": 606619862.0, + "step": 23451 + }, + { + "epoch": 2.5754447616955853, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7309439182281494, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7164896130561829, + "num_tokens": 606643526.0, + "step": 23452 + }, + { + "epoch": 2.575554579398199, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6765668392181396, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7271915674209595, + "num_tokens": 606665192.0, + "step": 23453 + }, + { + "epoch": 2.5756643971008124, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.315115451812744, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6962572932243347, + "num_tokens": 606694666.0, + "step": 23454 + }, + { + "epoch": 2.575774214803426, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.428272008895874, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.7017948627471924, + "num_tokens": 606721178.0, + "step": 23455 + }, + { + "epoch": 2.57588403250604, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.276061534881592, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7163653373718262, + "num_tokens": 606751290.0, + "step": 23456 + }, + { + "epoch": 2.5759938502086537, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5997166633605957, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.7034249305725098, + "num_tokens": 606777418.0, + "step": 23457 + }, + { + "epoch": 2.5761036679112674, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5327365398406982, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7181563377380371, + "num_tokens": 606802705.0, + "step": 23458 + }, + { + "epoch": 2.5762134856138807, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.500068426132202, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7259368896484375, + "num_tokens": 606826176.0, + "step": 23459 + }, + { + "epoch": 2.5763233033164945, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5770535469055176, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7393661141395569, + "num_tokens": 606851494.0, + "step": 23460 + }, + { + "epoch": 2.5764331210191083, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2664945125579834, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7021247148513794, + "num_tokens": 606880711.0, + "step": 23461 + }, + { + "epoch": 2.576542938721722, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.319878578186035, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6994366645812988, + "num_tokens": 606909157.0, + "step": 23462 + }, + { + "epoch": 2.5766527564243358, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5509560108184814, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7076926827430725, + "num_tokens": 606934181.0, + "step": 23463 + }, + { + "epoch": 2.576762574126949, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2366909980773926, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6860957145690918, + "num_tokens": 606967102.0, + "step": 23464 + }, + { + "epoch": 2.576872391829563, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2945477962493896, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7286359071731567, + "num_tokens": 606994772.0, + "step": 23465 + }, + { + "epoch": 2.5769822095321766, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2882192134857178, + "learning_rate": 1e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.6906741857528687, + "num_tokens": 607024344.0, + "step": 23466 + }, + { + "epoch": 2.5770920272347904, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.729997158050537, + 
"learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7142416834831238, + "num_tokens": 607045444.0, + "step": 23467 + }, + { + "epoch": 2.577201844937404, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3324694633483887, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7280834317207336, + "num_tokens": 607072488.0, + "step": 23468 + }, + { + "epoch": 2.5773116626400174, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6555898189544678, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6997950077056885, + "num_tokens": 607095180.0, + "step": 23469 + }, + { + "epoch": 2.577421480342631, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.546360969543457, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7294552326202393, + "num_tokens": 607118068.0, + "step": 23470 + }, + { + "epoch": 2.577531298045245, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.201342821121216, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7095935940742493, + "num_tokens": 607151100.0, + "step": 23471 + }, + { + "epoch": 2.5776411157478587, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.432551622390747, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.7111067771911621, + "num_tokens": 607176109.0, + "step": 23472 + }, + { + "epoch": 2.5777509334504725, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.523463487625122, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.728268027305603, + "num_tokens": 607200271.0, + "step": 23473 + }, + { + "epoch": 2.5778607511530858, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.329315185546875, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.6961754560470581, + "num_tokens": 607231627.0, + "step": 23474 + }, + { + "epoch": 2.5779705688556995, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3220138549804688, + "learning_rate": 1e-06, + 
"loss": 0.9379, + "mean_token_accuracy": 0.7174197435379028, + "num_tokens": 607261567.0, + "step": 23475 + }, + { + "epoch": 2.5780803865583133, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.533637285232544, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7231855392456055, + "num_tokens": 607283953.0, + "step": 23476 + }, + { + "epoch": 2.5781902042609266, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2874977588653564, + "learning_rate": 1e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.6845660209655762, + "num_tokens": 607314055.0, + "step": 23477 + }, + { + "epoch": 2.578300021963541, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2734880447387695, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7222282886505127, + "num_tokens": 607341826.0, + "step": 23478 + }, + { + "epoch": 2.578409839666154, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.563438653945923, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7312241196632385, + "num_tokens": 607362328.0, + "step": 23479 + }, + { + "epoch": 2.578519657368768, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6594724655151367, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7156758308410645, + "num_tokens": 607385401.0, + "step": 23480 + }, + { + "epoch": 2.5786294750713816, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.311414957046509, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7161558270454407, + "num_tokens": 607411585.0, + "step": 23481 + }, + { + "epoch": 2.578739292773995, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5565567016601562, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7051602602005005, + "num_tokens": 607438405.0, + "step": 23482 + }, + { + "epoch": 2.5788491104766087, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4102392196655273, + "learning_rate": 1e-06, + "loss": 0.9287, + 
"mean_token_accuracy": 0.7258475422859192, + "num_tokens": 607464513.0, + "step": 23483 + }, + { + "epoch": 2.5789589281792225, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5206682682037354, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7244945764541626, + "num_tokens": 607489036.0, + "step": 23484 + }, + { + "epoch": 2.579068745881836, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.279883623123169, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7111577987670898, + "num_tokens": 607518788.0, + "step": 23485 + }, + { + "epoch": 2.57917856358445, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.268080949783325, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7258926630020142, + "num_tokens": 607545199.0, + "step": 23486 + }, + { + "epoch": 2.5792883812870633, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3781943321228027, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6998546719551086, + "num_tokens": 607571705.0, + "step": 23487 + }, + { + "epoch": 2.579398198989677, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.266411304473877, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7273401021957397, + "num_tokens": 607600658.0, + "step": 23488 + }, + { + "epoch": 2.579508016692291, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5521011352539062, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7294514179229736, + "num_tokens": 607622523.0, + "step": 23489 + }, + { + "epoch": 2.5796178343949046, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.388472557067871, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7369865775108337, + "num_tokens": 607649235.0, + "step": 23490 + }, + { + "epoch": 2.5797276520975183, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.408662796020508, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 
0.7202223539352417, + "num_tokens": 607674183.0, + "step": 23491 + }, + { + "epoch": 2.5798374698001316, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.639946937561035, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.7014310359954834, + "num_tokens": 607697653.0, + "step": 23492 + }, + { + "epoch": 2.5799472875027454, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.420567750930786, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7164376974105835, + "num_tokens": 607720883.0, + "step": 23493 + }, + { + "epoch": 2.580057105205359, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.634530544281006, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7249622941017151, + "num_tokens": 607742152.0, + "step": 23494 + }, + { + "epoch": 2.580166922907973, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4662740230560303, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7170335054397583, + "num_tokens": 607768335.0, + "step": 23495 + }, + { + "epoch": 2.5802767406105866, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.442882776260376, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7098479866981506, + "num_tokens": 607795566.0, + "step": 23496 + }, + { + "epoch": 2.5803865583132, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4340438842773438, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6976090669631958, + "num_tokens": 607825518.0, + "step": 23497 + }, + { + "epoch": 2.5804963760158137, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.392822504043579, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7234442234039307, + "num_tokens": 607850631.0, + "step": 23498 + }, + { + "epoch": 2.5806061937184275, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.189396381378174, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6849759221076965, + 
"num_tokens": 607880161.0, + "step": 23499 + }, + { + "epoch": 2.5807160114210412, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0853703022003174, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6929362416267395, + "num_tokens": 607912535.0, + "step": 23500 + }, + { + "epoch": 2.580825829123655, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1284120082855225, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.712802529335022, + "num_tokens": 607949154.0, + "step": 23501 + }, + { + "epoch": 2.5809356468262683, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2847256660461426, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6998718976974487, + "num_tokens": 607978259.0, + "step": 23502 + }, + { + "epoch": 2.581045464528882, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6714835166931152, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7105861306190491, + "num_tokens": 607999828.0, + "step": 23503 + }, + { + "epoch": 2.581155282231496, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5297675132751465, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7073995471000671, + "num_tokens": 608026370.0, + "step": 23504 + }, + { + "epoch": 2.581265099934109, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4264252185821533, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7070631980895996, + "num_tokens": 608052186.0, + "step": 23505 + }, + { + "epoch": 2.581374917636723, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.751295328140259, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7380159497261047, + "num_tokens": 608071921.0, + "step": 23506 + }, + { + "epoch": 2.5814847353393366, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.818203926086426, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7182234525680542, + "num_tokens": 608094605.0, + 
"step": 23507 + }, + { + "epoch": 2.5815945530419504, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4306468963623047, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6861777305603027, + "num_tokens": 608121527.0, + "step": 23508 + }, + { + "epoch": 2.581704370744564, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.542228937149048, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7072916030883789, + "num_tokens": 608148738.0, + "step": 23509 + }, + { + "epoch": 2.5818141884471775, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.272765636444092, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.7014048099517822, + "num_tokens": 608178076.0, + "step": 23510 + }, + { + "epoch": 2.5819240061497912, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.646859884262085, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7361688017845154, + "num_tokens": 608200723.0, + "step": 23511 + }, + { + "epoch": 2.582033823852405, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6198103427886963, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7228468656539917, + "num_tokens": 608223552.0, + "step": 23512 + }, + { + "epoch": 2.5821436415550187, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4080734252929688, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7034392356872559, + "num_tokens": 608248956.0, + "step": 23513 + }, + { + "epoch": 2.5822534592576325, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1806294918060303, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7155189514160156, + "num_tokens": 608281144.0, + "step": 23514 + }, + { + "epoch": 2.582363276960246, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.155872344970703, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7220754623413086, + "num_tokens": 608298919.0, + "step": 23515 + }, + { + 
"epoch": 2.5824730946628596, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7936627864837646, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7479843497276306, + "num_tokens": 608317932.0, + "step": 23516 + }, + { + "epoch": 2.5825829123654733, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.458857297897339, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7137739658355713, + "num_tokens": 608343042.0, + "step": 23517 + }, + { + "epoch": 2.582692730068087, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4041736125946045, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7459238767623901, + "num_tokens": 608366394.0, + "step": 23518 + }, + { + "epoch": 2.582802547770701, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5143327713012695, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.7038900852203369, + "num_tokens": 608391877.0, + "step": 23519 + }, + { + "epoch": 2.582912365473314, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2881522178649902, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6987658739089966, + "num_tokens": 608422930.0, + "step": 23520 + }, + { + "epoch": 2.583022183175928, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3261172771453857, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6840044260025024, + "num_tokens": 608452170.0, + "step": 23521 + }, + { + "epoch": 2.5831320008785417, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.62416672706604, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7289627194404602, + "num_tokens": 608474138.0, + "step": 23522 + }, + { + "epoch": 2.5832418185811554, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.310159683227539, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7305203080177307, + "num_tokens": 608500710.0, + "step": 23523 + }, + { + "epoch": 2.583351636283769, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.458191156387329, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7289146184921265, + "num_tokens": 608526318.0, + "step": 23524 + }, + { + "epoch": 2.5834614539863825, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6760807037353516, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.718180775642395, + "num_tokens": 608549646.0, + "step": 23525 + }, + { + "epoch": 2.5835712716889963, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5788707733154297, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6967415809631348, + "num_tokens": 608576690.0, + "step": 23526 + }, + { + "epoch": 2.58368108939161, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3128609657287598, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7033673524856567, + "num_tokens": 608605824.0, + "step": 23527 + }, + { + "epoch": 2.5837909070942233, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4783732891082764, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7129735946655273, + "num_tokens": 608631410.0, + "step": 23528 + }, + { + "epoch": 2.5839007247968375, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.40779972076416, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7105264067649841, + "num_tokens": 608659695.0, + "step": 23529 + }, + { + "epoch": 2.584010542499451, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.392744302749634, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7060452699661255, + "num_tokens": 608684659.0, + "step": 23530 + }, + { + "epoch": 2.5841203602020646, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.35359787940979, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7143723964691162, + "num_tokens": 608712998.0, + "step": 23531 + }, + { + "epoch": 2.5842301779046783, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.166578769683838, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7063735127449036, + "num_tokens": 608742565.0, + "step": 23532 + }, + { + "epoch": 2.5843399956072917, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.198452949523926, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7222257256507874, + "num_tokens": 608772595.0, + "step": 23533 + }, + { + "epoch": 2.5844498133099054, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.495292901992798, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6963931322097778, + "num_tokens": 608800380.0, + "step": 23534 + }, + { + "epoch": 2.584559631012519, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.50596284866333, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6941105127334595, + "num_tokens": 608826483.0, + "step": 23535 + }, + { + "epoch": 2.584669448715133, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6319522857666016, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7041386961936951, + "num_tokens": 608851677.0, + "step": 23536 + }, + { + "epoch": 2.5847792664177467, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.415292739868164, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.711550235748291, + "num_tokens": 608877458.0, + "step": 23537 + }, + { + "epoch": 2.58488908412036, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4434261322021484, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7311359643936157, + "num_tokens": 608903584.0, + "step": 23538 + }, + { + "epoch": 2.5849989018229738, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3700573444366455, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7117940783500671, + "num_tokens": 608931746.0, + "step": 23539 + }, + { + "epoch": 2.5851087195255875, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 3.037585496902466, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6874655485153198, + "num_tokens": 608960365.0, + "step": 23540 + }, + { + "epoch": 2.5852185372282013, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3226237297058105, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.7072902917861938, + "num_tokens": 608990885.0, + "step": 23541 + }, + { + "epoch": 2.585328354930815, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.253497362136841, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7128719091415405, + "num_tokens": 609018899.0, + "step": 23542 + }, + { + "epoch": 2.5854381726334283, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5550878047943115, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7069735527038574, + "num_tokens": 609042820.0, + "step": 23543 + }, + { + "epoch": 2.585547990336042, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.335099697113037, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7110334038734436, + "num_tokens": 609071626.0, + "step": 23544 + }, + { + "epoch": 2.585657808038656, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5991063117980957, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.706321120262146, + "num_tokens": 609094227.0, + "step": 23545 + }, + { + "epoch": 2.5857676257412696, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2029547691345215, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7083621025085449, + "num_tokens": 609125138.0, + "step": 23546 + }, + { + "epoch": 2.5858774434438834, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3391339778900146, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.733413577079773, + "num_tokens": 609153763.0, + "step": 23547 + }, + { + "epoch": 2.5859872611464967, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4045395851135254, 
+ "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7084723711013794, + "num_tokens": 609181468.0, + "step": 23548 + }, + { + "epoch": 2.5860970788491104, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.682727813720703, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.733792781829834, + "num_tokens": 609203204.0, + "step": 23549 + }, + { + "epoch": 2.586206896551724, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.595515727996826, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7283707857131958, + "num_tokens": 609226339.0, + "step": 23550 + }, + { + "epoch": 2.586316714254338, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5569281578063965, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7064278721809387, + "num_tokens": 609251273.0, + "step": 23551 + }, + { + "epoch": 2.5864265319569517, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5367422103881836, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7110599279403687, + "num_tokens": 609275377.0, + "step": 23552 + }, + { + "epoch": 2.586536349659565, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4802348613739014, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7125449776649475, + "num_tokens": 609302709.0, + "step": 23553 + }, + { + "epoch": 2.586646167362179, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.567157030105591, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6987990736961365, + "num_tokens": 609328684.0, + "step": 23554 + }, + { + "epoch": 2.5867559850647925, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.218505382537842, + "learning_rate": 1e-06, + "loss": 1.0932, + "mean_token_accuracy": 0.6812993288040161, + "num_tokens": 609358995.0, + "step": 23555 + }, + { + "epoch": 2.586865802767406, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5223751068115234, + "learning_rate": 1e-06, + "loss": 
1.0229, + "mean_token_accuracy": 0.7033770084381104, + "num_tokens": 609383638.0, + "step": 23556 + }, + { + "epoch": 2.5869756204700196, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6299164295196533, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7027466297149658, + "num_tokens": 609406471.0, + "step": 23557 + }, + { + "epoch": 2.5870854381726334, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4955921173095703, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.698914110660553, + "num_tokens": 609432105.0, + "step": 23558 + }, + { + "epoch": 2.587195255875247, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4130706787109375, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.698175311088562, + "num_tokens": 609460303.0, + "step": 23559 + }, + { + "epoch": 2.587305073577861, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5812790393829346, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6956529021263123, + "num_tokens": 609482805.0, + "step": 23560 + }, + { + "epoch": 2.587414891280474, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.511831760406494, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7228455543518066, + "num_tokens": 609508378.0, + "step": 23561 + }, + { + "epoch": 2.587524708983088, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5485355854034424, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7124972939491272, + "num_tokens": 609532991.0, + "step": 23562 + }, + { + "epoch": 2.5876345266857017, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.193673849105835, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6929305791854858, + "num_tokens": 609562786.0, + "step": 23563 + }, + { + "epoch": 2.5877443443883155, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5201897621154785, + "learning_rate": 1e-06, + "loss": 0.8404, + 
"mean_token_accuracy": 0.7427026033401489, + "num_tokens": 609584798.0, + "step": 23564 + }, + { + "epoch": 2.587854162090929, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3843517303466797, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7167134284973145, + "num_tokens": 609613866.0, + "step": 23565 + }, + { + "epoch": 2.5879639797935425, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.47516131401062, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7459467053413391, + "num_tokens": 609637725.0, + "step": 23566 + }, + { + "epoch": 2.5880737974961563, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.271172285079956, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7163773775100708, + "num_tokens": 609666176.0, + "step": 23567 + }, + { + "epoch": 2.58818361519877, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.583987236022949, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7217344641685486, + "num_tokens": 609688773.0, + "step": 23568 + }, + { + "epoch": 2.588293432901384, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6136977672576904, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7166742086410522, + "num_tokens": 609713472.0, + "step": 23569 + }, + { + "epoch": 2.5884032506039976, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3281970024108887, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.730185866355896, + "num_tokens": 609740360.0, + "step": 23570 + }, + { + "epoch": 2.588513068306611, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.507814645767212, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.723429799079895, + "num_tokens": 609767501.0, + "step": 23571 + }, + { + "epoch": 2.5886228860092246, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5618391036987305, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 
0.723452091217041, + "num_tokens": 609791624.0, + "step": 23572 + }, + { + "epoch": 2.5887327037118384, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.741791009902954, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7399003505706787, + "num_tokens": 609811024.0, + "step": 23573 + }, + { + "epoch": 2.588842521414452, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4013261795043945, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7252800464630127, + "num_tokens": 609835938.0, + "step": 23574 + }, + { + "epoch": 2.588952339117066, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2244279384613037, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7120803594589233, + "num_tokens": 609865852.0, + "step": 23575 + }, + { + "epoch": 2.589062156819679, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4004220962524414, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7067849636077881, + "num_tokens": 609892294.0, + "step": 23576 + }, + { + "epoch": 2.589171974522293, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.361783742904663, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7135604023933411, + "num_tokens": 609918345.0, + "step": 23577 + }, + { + "epoch": 2.5892817922249067, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6219863891601562, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7113417983055115, + "num_tokens": 609943155.0, + "step": 23578 + }, + { + "epoch": 2.58939160992752, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.456998825073242, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7415164113044739, + "num_tokens": 609967669.0, + "step": 23579 + }, + { + "epoch": 2.5895014276301342, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.173121690750122, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7074887156486511, + 
"num_tokens": 609999713.0, + "step": 23580 + }, + { + "epoch": 2.5896112453327476, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2569050788879395, + "learning_rate": 1e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.697089433670044, + "num_tokens": 610029806.0, + "step": 23581 + }, + { + "epoch": 2.5897210630353613, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.244636297225952, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.689798891544342, + "num_tokens": 610060118.0, + "step": 23582 + }, + { + "epoch": 2.589830880737975, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.534851551055908, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.705036461353302, + "num_tokens": 610086735.0, + "step": 23583 + }, + { + "epoch": 2.5899406984405884, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2943971157073975, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.693631649017334, + "num_tokens": 610115401.0, + "step": 23584 + }, + { + "epoch": 2.590050516143202, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6277449131011963, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7255908250808716, + "num_tokens": 610138102.0, + "step": 23585 + }, + { + "epoch": 2.590160333845816, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.201219081878662, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7036426663398743, + "num_tokens": 610169734.0, + "step": 23586 + }, + { + "epoch": 2.5902701515484297, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4426932334899902, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7088760137557983, + "num_tokens": 610194219.0, + "step": 23587 + }, + { + "epoch": 2.5903799692510434, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5447914600372314, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6893799304962158, + "num_tokens": 610218464.0, + 
"step": 23588 + }, + { + "epoch": 2.5904897869536567, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4859132766723633, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7186092138290405, + "num_tokens": 610244318.0, + "step": 23589 + }, + { + "epoch": 2.5905996046562705, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7765002250671387, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7121152877807617, + "num_tokens": 610265950.0, + "step": 23590 + }, + { + "epoch": 2.5907094223588842, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5377135276794434, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7132449150085449, + "num_tokens": 610289450.0, + "step": 23591 + }, + { + "epoch": 2.590819240061498, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3059580326080322, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.712244987487793, + "num_tokens": 610318546.0, + "step": 23592 + }, + { + "epoch": 2.5909290577641118, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3769583702087402, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7140057682991028, + "num_tokens": 610344578.0, + "step": 23593 + }, + { + "epoch": 2.591038875466725, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6341443061828613, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.729692280292511, + "num_tokens": 610368562.0, + "step": 23594 + }, + { + "epoch": 2.591148693169339, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4350318908691406, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7147146463394165, + "num_tokens": 610393880.0, + "step": 23595 + }, + { + "epoch": 2.5912585108719526, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.122053623199463, + "learning_rate": 1e-06, + "loss": 1.0917, + "mean_token_accuracy": 0.6807683706283569, + "num_tokens": 610427393.0, + "step": 23596 + }, + { + 
"epoch": 2.5913683285745663, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.725823402404785, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7312715649604797, + "num_tokens": 610449326.0, + "step": 23597 + }, + { + "epoch": 2.59147814627718, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4344699382781982, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.7017401456832886, + "num_tokens": 610477099.0, + "step": 23598 + }, + { + "epoch": 2.5915879639797934, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.573552370071411, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7016212940216064, + "num_tokens": 610502104.0, + "step": 23599 + }, + { + "epoch": 2.591697781682407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.470949172973633, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7063499689102173, + "num_tokens": 610526704.0, + "step": 23600 + }, + { + "epoch": 2.591807599385021, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4738833904266357, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7138315439224243, + "num_tokens": 610552280.0, + "step": 23601 + }, + { + "epoch": 2.5919174170876347, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9526498317718506, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7322004437446594, + "num_tokens": 610570278.0, + "step": 23602 + }, + { + "epoch": 2.5920272347902484, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4547791481018066, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7075064778327942, + "num_tokens": 610594914.0, + "step": 23603 + }, + { + "epoch": 2.5921370524928617, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5304999351501465, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7322924733161926, + "num_tokens": 610618667.0, + "step": 23604 + }, + { + "epoch": 2.5922468701954755, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5012519359588623, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7314530611038208, + "num_tokens": 610641748.0, + "step": 23605 + }, + { + "epoch": 2.5923566878980893, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1110622882843018, + "learning_rate": 1e-06, + "loss": 1.1206, + "mean_token_accuracy": 0.6762390732765198, + "num_tokens": 610678830.0, + "step": 23606 + }, + { + "epoch": 2.5924665056007026, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2610931396484375, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7354649305343628, + "num_tokens": 610706433.0, + "step": 23607 + }, + { + "epoch": 2.5925763233033163, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.380380868911743, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7125260829925537, + "num_tokens": 610732306.0, + "step": 23608 + }, + { + "epoch": 2.59268614100593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.595510482788086, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7209389209747314, + "num_tokens": 610755381.0, + "step": 23609 + }, + { + "epoch": 2.592795958708544, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6612534523010254, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6962393522262573, + "num_tokens": 610779631.0, + "step": 23610 + }, + { + "epoch": 2.5929057764111576, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4078729152679443, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.686056911945343, + "num_tokens": 610807265.0, + "step": 23611 + }, + { + "epoch": 2.593015594113771, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.168276071548462, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7179797291755676, + "num_tokens": 610837654.0, + "step": 23612 + }, + { + "epoch": 2.5931254118163847, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.3072891235351562, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.727934718132019, + "num_tokens": 610863735.0, + "step": 23613 + }, + { + "epoch": 2.5932352295189984, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.216702938079834, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7142418026924133, + "num_tokens": 610894086.0, + "step": 23614 + }, + { + "epoch": 2.593345047221612, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6149842739105225, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7174720168113708, + "num_tokens": 610916455.0, + "step": 23615 + }, + { + "epoch": 2.593454864924226, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6756560802459717, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.7086066007614136, + "num_tokens": 610939189.0, + "step": 23616 + }, + { + "epoch": 2.5935646826268393, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.597602367401123, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7268096208572388, + "num_tokens": 610962545.0, + "step": 23617 + }, + { + "epoch": 2.593674500329453, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.0701029300689697, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7202857732772827, + "num_tokens": 610979802.0, + "step": 23618 + }, + { + "epoch": 2.5937843180320668, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4083657264709473, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7118692398071289, + "num_tokens": 611007790.0, + "step": 23619 + }, + { + "epoch": 2.5938941357346805, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7290446758270264, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7077130079269409, + "num_tokens": 611028766.0, + "step": 23620 + }, + { + "epoch": 2.5940039534372943, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.2908754348754883, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6963323354721069, + "num_tokens": 611060140.0, + "step": 23621 + }, + { + "epoch": 2.5941137711399076, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.501864194869995, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7184706926345825, + "num_tokens": 611084267.0, + "step": 23622 + }, + { + "epoch": 2.5942235888425214, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4065158367156982, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.7014563083648682, + "num_tokens": 611110761.0, + "step": 23623 + }, + { + "epoch": 2.594333406545135, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4123167991638184, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6867562532424927, + "num_tokens": 611136743.0, + "step": 23624 + }, + { + "epoch": 2.594443224247749, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5471715927124023, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7155935764312744, + "num_tokens": 611162697.0, + "step": 23625 + }, + { + "epoch": 2.5945530419503626, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3352057933807373, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7065537571907043, + "num_tokens": 611190809.0, + "step": 23626 + }, + { + "epoch": 2.594662859652976, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.344959259033203, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6844044923782349, + "num_tokens": 611219321.0, + "step": 23627 + }, + { + "epoch": 2.5947726773555897, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7571654319763184, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7183300256729126, + "num_tokens": 611240407.0, + "step": 23628 + }, + { + "epoch": 2.5948824950582035, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.2866761684417725, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7087728381156921, + "num_tokens": 611269693.0, + "step": 23629 + }, + { + "epoch": 2.594992312760817, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4462971687316895, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6905816793441772, + "num_tokens": 611294073.0, + "step": 23630 + }, + { + "epoch": 2.595102130463431, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3144400119781494, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7068823575973511, + "num_tokens": 611322370.0, + "step": 23631 + }, + { + "epoch": 2.5952119481660443, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4763474464416504, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7019383907318115, + "num_tokens": 611346265.0, + "step": 23632 + }, + { + "epoch": 2.595321765868658, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3811659812927246, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7152886986732483, + "num_tokens": 611371607.0, + "step": 23633 + }, + { + "epoch": 2.595431583571272, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.176823616027832, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7323932647705078, + "num_tokens": 611399678.0, + "step": 23634 + }, + { + "epoch": 2.595541401273885, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3063886165618896, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7114900946617126, + "num_tokens": 611428886.0, + "step": 23635 + }, + { + "epoch": 2.595651218976499, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.588369369506836, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7080572247505188, + "num_tokens": 611451301.0, + "step": 23636 + }, + { + "epoch": 2.5957610366791126, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3525044918060303, + 
"learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7128320932388306, + "num_tokens": 611478880.0, + "step": 23637 + }, + { + "epoch": 2.5958708543817264, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3660709857940674, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7100232243537903, + "num_tokens": 611506458.0, + "step": 23638 + }, + { + "epoch": 2.59598067208434, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3850443363189697, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7305251359939575, + "num_tokens": 611532584.0, + "step": 23639 + }, + { + "epoch": 2.5960904897869534, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.9810914993286133, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7191329598426819, + "num_tokens": 611568513.0, + "step": 23640 + }, + { + "epoch": 2.596200307489567, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5705409049987793, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6969387531280518, + "num_tokens": 611595791.0, + "step": 23641 + }, + { + "epoch": 2.596310125192181, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3636059761047363, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.7007606029510498, + "num_tokens": 611623092.0, + "step": 23642 + }, + { + "epoch": 2.5964199428947947, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.562328815460205, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7192944884300232, + "num_tokens": 611646677.0, + "step": 23643 + }, + { + "epoch": 2.5965297605974085, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3992416858673096, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7200080156326294, + "num_tokens": 611672403.0, + "step": 23644 + }, + { + "epoch": 2.596639578300022, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.308992624282837, + "learning_rate": 1e-06, + 
"loss": 0.9579, + "mean_token_accuracy": 0.7175858020782471, + "num_tokens": 611702719.0, + "step": 23645 + }, + { + "epoch": 2.5967493960026355, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.9875867366790771, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6919347047805786, + "num_tokens": 611740960.0, + "step": 23646 + }, + { + "epoch": 2.5968592137052493, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3218131065368652, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7174882888793945, + "num_tokens": 611770401.0, + "step": 23647 + }, + { + "epoch": 2.596969031407863, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5555026531219482, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7306884527206421, + "num_tokens": 611791834.0, + "step": 23648 + }, + { + "epoch": 2.597078849110477, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4801743030548096, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7124837636947632, + "num_tokens": 611818006.0, + "step": 23649 + }, + { + "epoch": 2.59718866681309, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8858604431152344, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.71605384349823, + "num_tokens": 611836610.0, + "step": 23650 + }, + { + "epoch": 2.597298484515704, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.309863567352295, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6977124214172363, + "num_tokens": 611868349.0, + "step": 23651 + }, + { + "epoch": 2.5974083022183176, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.672150135040283, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7363584041595459, + "num_tokens": 611888622.0, + "step": 23652 + }, + { + "epoch": 2.5975181199209314, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2085459232330322, + "learning_rate": 1e-06, + "loss": 0.9649, + 
"mean_token_accuracy": 0.730999231338501, + "num_tokens": 611919472.0, + "step": 23653 + }, + { + "epoch": 2.597627937623545, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1796762943267822, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7129349112510681, + "num_tokens": 611949302.0, + "step": 23654 + }, + { + "epoch": 2.5977377553261585, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3451521396636963, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7190511226654053, + "num_tokens": 611977106.0, + "step": 23655 + }, + { + "epoch": 2.5978475730287722, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.488865613937378, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7166294455528259, + "num_tokens": 612002776.0, + "step": 23656 + }, + { + "epoch": 2.597957390731386, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.426297664642334, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6981721520423889, + "num_tokens": 612028747.0, + "step": 23657 + }, + { + "epoch": 2.5980672084339993, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7009975910186768, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7281437516212463, + "num_tokens": 612051398.0, + "step": 23658 + }, + { + "epoch": 2.5981770261366135, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.523098945617676, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7198121547698975, + "num_tokens": 612076770.0, + "step": 23659 + }, + { + "epoch": 2.598286843839227, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.427889347076416, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7241190671920776, + "num_tokens": 612100720.0, + "step": 23660 + }, + { + "epoch": 2.5983966615418406, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5026180744171143, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 
0.7349089980125427, + "num_tokens": 612124654.0, + "step": 23661 + }, + { + "epoch": 2.5985064792444543, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.797384738922119, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7442699670791626, + "num_tokens": 612144913.0, + "step": 23662 + }, + { + "epoch": 2.5986162969470676, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.326422929763794, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7361673712730408, + "num_tokens": 612171849.0, + "step": 23663 + }, + { + "epoch": 2.5987261146496814, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.27103590965271, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6929655075073242, + "num_tokens": 612203713.0, + "step": 23664 + }, + { + "epoch": 2.598835932352295, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2378957271575928, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7152010798454285, + "num_tokens": 612232019.0, + "step": 23665 + }, + { + "epoch": 2.598945750054909, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.67970609664917, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7126073241233826, + "num_tokens": 612254444.0, + "step": 23666 + }, + { + "epoch": 2.5990555677575227, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3658676147460938, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7370036840438843, + "num_tokens": 612280856.0, + "step": 23667 + }, + { + "epoch": 2.599165385460136, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5074563026428223, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7086819410324097, + "num_tokens": 612304793.0, + "step": 23668 + }, + { + "epoch": 2.5992752031627497, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.604407548904419, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6969130039215088, + 
"num_tokens": 612330615.0, + "step": 23669 + }, + { + "epoch": 2.5993850208653635, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3990426063537598, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7274043560028076, + "num_tokens": 612356540.0, + "step": 23670 + }, + { + "epoch": 2.5994948385679773, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7097628116607666, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7216808199882507, + "num_tokens": 612378341.0, + "step": 23671 + }, + { + "epoch": 2.599604656270591, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.341095209121704, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7045725584030151, + "num_tokens": 612409321.0, + "step": 23672 + }, + { + "epoch": 2.5997144739732043, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4305837154388428, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6896673440933228, + "num_tokens": 612436828.0, + "step": 23673 + }, + { + "epoch": 2.599824291675818, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5034210681915283, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.708729088306427, + "num_tokens": 612461194.0, + "step": 23674 + }, + { + "epoch": 2.599934109378432, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.598219633102417, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7112579345703125, + "num_tokens": 612484439.0, + "step": 23675 + }, + { + "epoch": 2.6000439270810456, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3841965198516846, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7196047902107239, + "num_tokens": 612510719.0, + "step": 23676 + }, + { + "epoch": 2.6001537447836593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5571489334106445, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7277725338935852, + "num_tokens": 612531870.0, + 
"step": 23677 + }, + { + "epoch": 2.6002635624862727, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5159850120544434, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7239665985107422, + "num_tokens": 612559693.0, + "step": 23678 + }, + { + "epoch": 2.6003733801888864, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.633125066757202, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7160140872001648, + "num_tokens": 612582916.0, + "step": 23679 + }, + { + "epoch": 2.6004831978915, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.584841012954712, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7156616449356079, + "num_tokens": 612607430.0, + "step": 23680 + }, + { + "epoch": 2.600593015594114, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5599889755249023, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7151989936828613, + "num_tokens": 612632203.0, + "step": 23681 + }, + { + "epoch": 2.6007028332967277, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2686855792999268, + "learning_rate": 1e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6866758465766907, + "num_tokens": 612663563.0, + "step": 23682 + }, + { + "epoch": 2.600812650999341, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5703179836273193, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7192362546920776, + "num_tokens": 612687162.0, + "step": 23683 + }, + { + "epoch": 2.6009224687019548, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.894831657409668, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7080637216567993, + "num_tokens": 612708150.0, + "step": 23684 + }, + { + "epoch": 2.6010322864045685, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5979089736938477, + "learning_rate": 1e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.700718879699707, + "num_tokens": 612732273.0, + "step": 23685 + }, + { + 
"epoch": 2.601142104107182, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.208101749420166, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.703938364982605, + "num_tokens": 612762367.0, + "step": 23686 + }, + { + "epoch": 2.6012519218097956, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7611019611358643, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7259207963943481, + "num_tokens": 612784479.0, + "step": 23687 + }, + { + "epoch": 2.6013617395124093, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5186586380004883, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7391998767852783, + "num_tokens": 612807784.0, + "step": 23688 + }, + { + "epoch": 2.601471557215023, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5910871028900146, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.720918595790863, + "num_tokens": 612830441.0, + "step": 23689 + }, + { + "epoch": 2.601581374917637, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1227636337280273, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7364215850830078, + "num_tokens": 612860287.0, + "step": 23690 + }, + { + "epoch": 2.60169119262025, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6074419021606445, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7052605152130127, + "num_tokens": 612885853.0, + "step": 23691 + }, + { + "epoch": 2.601801010322864, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.469027042388916, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7065568566322327, + "num_tokens": 612913970.0, + "step": 23692 + }, + { + "epoch": 2.6019108280254777, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.624464750289917, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7271850109100342, + "num_tokens": 612936848.0, + "step": 23693 + }, + { + "epoch": 2.6020206457280914, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.37583327293396, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7094043493270874, + "num_tokens": 612962683.0, + "step": 23694 + }, + { + "epoch": 2.602130463430705, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.294377565383911, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7367316484451294, + "num_tokens": 612990744.0, + "step": 23695 + }, + { + "epoch": 2.6022402811333185, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.376479148864746, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7242469787597656, + "num_tokens": 613017058.0, + "step": 23696 + }, + { + "epoch": 2.6023500988359323, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6595277786254883, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7303268909454346, + "num_tokens": 613038577.0, + "step": 23697 + }, + { + "epoch": 2.602459916538546, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.574263572692871, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.700954020023346, + "num_tokens": 613067846.0, + "step": 23698 + }, + { + "epoch": 2.60256973424116, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.68054461479187, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7078183889389038, + "num_tokens": 613089819.0, + "step": 23699 + }, + { + "epoch": 2.6026795519437735, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.615694761276245, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7157476544380188, + "num_tokens": 613111268.0, + "step": 23700 + }, + { + "epoch": 2.602789369646387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417705535888672, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7201433777809143, + "num_tokens": 613138421.0, + "step": 23701 + }, + { + "epoch": 2.6028991873490006, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.664466142654419, + "learning_rate": 1e-06, + "loss": 1.0653, + "mean_token_accuracy": 0.6919888257980347, + "num_tokens": 613163254.0, + "step": 23702 + }, + { + "epoch": 2.6030090050516144, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.298327922821045, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7066600322723389, + "num_tokens": 613196287.0, + "step": 23703 + }, + { + "epoch": 2.603118822754228, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7666730880737305, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7043207287788391, + "num_tokens": 613220044.0, + "step": 23704 + }, + { + "epoch": 2.603228640456842, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7231948375701904, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.702004075050354, + "num_tokens": 613241739.0, + "step": 23705 + }, + { + "epoch": 2.603338458159455, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3474366664886475, + "learning_rate": 1e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6866005659103394, + "num_tokens": 613269674.0, + "step": 23706 + }, + { + "epoch": 2.603448275862069, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4311740398406982, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7135217189788818, + "num_tokens": 613299200.0, + "step": 23707 + }, + { + "epoch": 2.6035580935646827, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2989845275878906, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7309554815292358, + "num_tokens": 613326806.0, + "step": 23708 + }, + { + "epoch": 2.603667911267296, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.063317060470581, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.6880427598953247, + "num_tokens": 613363218.0, + "step": 23709 + }, + { + "epoch": 2.60377772896991, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2417690753936768, + 
"learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7376059293746948, + "num_tokens": 613392091.0, + "step": 23710 + }, + { + "epoch": 2.6038875466725235, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.477318286895752, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7344437837600708, + "num_tokens": 613415949.0, + "step": 23711 + }, + { + "epoch": 2.6039973643751373, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4085843563079834, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7040794491767883, + "num_tokens": 613442712.0, + "step": 23712 + }, + { + "epoch": 2.604107182077751, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.522764205932617, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7051218748092651, + "num_tokens": 613468117.0, + "step": 23713 + }, + { + "epoch": 2.6042169997803644, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2654459476470947, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6911474466323853, + "num_tokens": 613497539.0, + "step": 23714 + }, + { + "epoch": 2.604326817482978, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3424324989318848, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7208113670349121, + "num_tokens": 613522873.0, + "step": 23715 + }, + { + "epoch": 2.604436635185592, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.478480339050293, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6866089105606079, + "num_tokens": 613549805.0, + "step": 23716 + }, + { + "epoch": 2.6045464528882056, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.473051071166992, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6988576650619507, + "num_tokens": 613576634.0, + "step": 23717 + }, + { + "epoch": 2.6046562705908194, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5052695274353027, + "learning_rate": 1e-06, + 
"loss": 1.0385, + "mean_token_accuracy": 0.7088582515716553, + "num_tokens": 613600814.0, + "step": 23718 + }, + { + "epoch": 2.6047660882934327, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.405167579650879, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7109083533287048, + "num_tokens": 613626305.0, + "step": 23719 + }, + { + "epoch": 2.6048759059960465, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3271567821502686, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.720912754535675, + "num_tokens": 613654566.0, + "step": 23720 + }, + { + "epoch": 2.60498572369866, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4415109157562256, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.726686418056488, + "num_tokens": 613678702.0, + "step": 23721 + }, + { + "epoch": 2.605095541401274, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5003764629364014, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6844232082366943, + "num_tokens": 613705791.0, + "step": 23722 + }, + { + "epoch": 2.6052053591038877, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3224921226501465, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.703886091709137, + "num_tokens": 613734905.0, + "step": 23723 + }, + { + "epoch": 2.605315176806501, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.609006404876709, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7331111431121826, + "num_tokens": 613758211.0, + "step": 23724 + }, + { + "epoch": 2.605424994509115, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7966530323028564, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7055562734603882, + "num_tokens": 613778836.0, + "step": 23725 + }, + { + "epoch": 2.6055348122117286, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 4.44201135635376, + "learning_rate": 1e-06, + "loss": 0.991, + 
"mean_token_accuracy": 0.7052878141403198, + "num_tokens": 613801624.0, + "step": 23726 + }, + { + "epoch": 2.6056446299143423, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4253454208374023, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7121856212615967, + "num_tokens": 613829644.0, + "step": 23727 + }, + { + "epoch": 2.605754447616956, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5524983406066895, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.719266414642334, + "num_tokens": 613853936.0, + "step": 23728 + }, + { + "epoch": 2.6058642653195694, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.744713306427002, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7316418290138245, + "num_tokens": 613873765.0, + "step": 23729 + }, + { + "epoch": 2.605974083022183, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3634262084960938, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7111396789550781, + "num_tokens": 613901446.0, + "step": 23730 + }, + { + "epoch": 2.606083900724797, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4377965927124023, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7230979204177856, + "num_tokens": 613926484.0, + "step": 23731 + }, + { + "epoch": 2.6061937184274107, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.370509147644043, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.711341142654419, + "num_tokens": 613955178.0, + "step": 23732 + }, + { + "epoch": 2.6063035361300244, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2091522216796875, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7021601796150208, + "num_tokens": 613986262.0, + "step": 23733 + }, + { + "epoch": 2.6064133538326377, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8585104942321777, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 
0.7402970790863037, + "num_tokens": 614004905.0, + "step": 23734 + }, + { + "epoch": 2.6065231715352515, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.528538703918457, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7212024927139282, + "num_tokens": 614028283.0, + "step": 23735 + }, + { + "epoch": 2.6066329892378652, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8911259174346924, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7253263592720032, + "num_tokens": 614046571.0, + "step": 23736 + }, + { + "epoch": 2.6067428069404786, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.770949602127075, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.732492208480835, + "num_tokens": 614067079.0, + "step": 23737 + }, + { + "epoch": 2.6068526246430923, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.523172616958618, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7080618143081665, + "num_tokens": 614092766.0, + "step": 23738 + }, + { + "epoch": 2.606962442345706, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.558527946472168, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7185322046279907, + "num_tokens": 614115213.0, + "step": 23739 + }, + { + "epoch": 2.60707226004832, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7499094009399414, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7203229069709778, + "num_tokens": 614137069.0, + "step": 23740 + }, + { + "epoch": 2.6071820777509336, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7845985889434814, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7380410432815552, + "num_tokens": 614160859.0, + "step": 23741 + }, + { + "epoch": 2.607291895453547, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.264967203140259, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7084395289421082, + 
"num_tokens": 614188900.0, + "step": 23742 + }, + { + "epoch": 2.6074017131561606, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3439886569976807, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7112407684326172, + "num_tokens": 614217035.0, + "step": 23743 + }, + { + "epoch": 2.6075115308587744, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.171209335327148, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.707019567489624, + "num_tokens": 614239744.0, + "step": 23744 + }, + { + "epoch": 2.607621348561388, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4791665077209473, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7201352119445801, + "num_tokens": 614266888.0, + "step": 23745 + }, + { + "epoch": 2.607731166264002, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.400908946990967, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6966788172721863, + "num_tokens": 614295459.0, + "step": 23746 + }, + { + "epoch": 2.6078409839666152, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.477661609649658, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7246297001838684, + "num_tokens": 614319657.0, + "step": 23747 + }, + { + "epoch": 2.607950801669229, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5822463035583496, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7153249979019165, + "num_tokens": 614343494.0, + "step": 23748 + }, + { + "epoch": 2.6080606193718427, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.354919672012329, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7172307968139648, + "num_tokens": 614371395.0, + "step": 23749 + }, + { + "epoch": 2.6081704370744565, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.504289388656616, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7189654111862183, + "num_tokens": 614398280.0, + 
"step": 23750 + }, + { + "epoch": 2.6082802547770703, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5991899967193604, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7177654504776001, + "num_tokens": 614423501.0, + "step": 23751 + }, + { + "epoch": 2.6083900724796836, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8979544639587402, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7303438186645508, + "num_tokens": 614443447.0, + "step": 23752 + }, + { + "epoch": 2.6084998901822973, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4785656929016113, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.694546639919281, + "num_tokens": 614470168.0, + "step": 23753 + }, + { + "epoch": 2.608609707884911, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3592617511749268, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.715882420539856, + "num_tokens": 614496569.0, + "step": 23754 + }, + { + "epoch": 2.608719525587525, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.431236743927002, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7055224180221558, + "num_tokens": 614522567.0, + "step": 23755 + }, + { + "epoch": 2.6088293432901386, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.421058177947998, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7087589502334595, + "num_tokens": 614547755.0, + "step": 23756 + }, + { + "epoch": 2.608939160992752, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5009076595306396, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.7044462561607361, + "num_tokens": 614574353.0, + "step": 23757 + }, + { + "epoch": 2.6090489786953657, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.383185863494873, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7176027297973633, + "num_tokens": 614600494.0, + "step": 23758 + }, + { + 
"epoch": 2.6091587963979794, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7194676399230957, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7087456583976746, + "num_tokens": 614624797.0, + "step": 23759 + }, + { + "epoch": 2.6092686141005927, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3760335445404053, + "learning_rate": 1e-06, + "loss": 1.0806, + "mean_token_accuracy": 0.6822549104690552, + "num_tokens": 614655913.0, + "step": 23760 + }, + { + "epoch": 2.609378431803207, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5283501148223877, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7352540493011475, + "num_tokens": 614679388.0, + "step": 23761 + }, + { + "epoch": 2.6094882495058203, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2379019260406494, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7256415486335754, + "num_tokens": 614709117.0, + "step": 23762 + }, + { + "epoch": 2.609598067208434, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.953416347503662, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7318874001502991, + "num_tokens": 614727989.0, + "step": 23763 + }, + { + "epoch": 2.6097078849110478, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6092545986175537, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7279185652732849, + "num_tokens": 614750503.0, + "step": 23764 + }, + { + "epoch": 2.609817702613661, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.681361198425293, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7131847143173218, + "num_tokens": 614772555.0, + "step": 23765 + }, + { + "epoch": 2.609927520316275, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4584226608276367, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.7010648250579834, + "num_tokens": 614798477.0, + "step": 23766 + }, + { + "epoch": 2.6100373380188886, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.416374921798706, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7284489870071411, + "num_tokens": 614821899.0, + "step": 23767 + }, + { + "epoch": 2.6101471557215024, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 6.956957817077637, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7216393947601318, + "num_tokens": 614853929.0, + "step": 23768 + }, + { + "epoch": 2.610256973424116, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2958991527557373, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7334717512130737, + "num_tokens": 614880210.0, + "step": 23769 + }, + { + "epoch": 2.6103667911267294, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5349249839782715, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7371689081192017, + "num_tokens": 614903738.0, + "step": 23770 + }, + { + "epoch": 2.610476608829343, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2497804164886475, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7419804930686951, + "num_tokens": 614929478.0, + "step": 23771 + }, + { + "epoch": 2.610586426531957, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4467782974243164, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7060058116912842, + "num_tokens": 614955346.0, + "step": 23772 + }, + { + "epoch": 2.6106962442345707, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3204195499420166, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7235664129257202, + "num_tokens": 614982587.0, + "step": 23773 + }, + { + "epoch": 2.6108060619371845, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.513756513595581, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.701054036617279, + "num_tokens": 615006632.0, + "step": 23774 + }, + { + "epoch": 2.6109158796397978, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.3168060779571533, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7156662940979004, + "num_tokens": 615035959.0, + "step": 23775 + }, + { + "epoch": 2.6110256973424115, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2476792335510254, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6993686556816101, + "num_tokens": 615064681.0, + "step": 23776 + }, + { + "epoch": 2.6111355150450253, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.4868838787078857, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7141587138175964, + "num_tokens": 615090058.0, + "step": 23777 + }, + { + "epoch": 2.611245332747639, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3883748054504395, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7281578779220581, + "num_tokens": 615115131.0, + "step": 23778 + }, + { + "epoch": 2.611355150450253, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.569126605987549, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7387807369232178, + "num_tokens": 615137660.0, + "step": 23779 + }, + { + "epoch": 2.611464968152866, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.341647148132324, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7243639230728149, + "num_tokens": 615164764.0, + "step": 23780 + }, + { + "epoch": 2.61157478585548, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2396023273468018, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7211371660232544, + "num_tokens": 615193518.0, + "step": 23781 + }, + { + "epoch": 2.6116846035580936, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7121589183807373, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.72878098487854, + "num_tokens": 615214833.0, + "step": 23782 + }, + { + "epoch": 2.6117944212607074, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.3697237968444824, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7177306413650513, + "num_tokens": 615240146.0, + "step": 23783 + }, + { + "epoch": 2.611904238963321, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.343440532684326, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7206156253814697, + "num_tokens": 615266486.0, + "step": 23784 + }, + { + "epoch": 2.6120140566659344, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.363147497177124, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.716314435005188, + "num_tokens": 615295792.0, + "step": 23785 + }, + { + "epoch": 2.612123874368548, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.308156967163086, + "learning_rate": 1e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7485759258270264, + "num_tokens": 615322940.0, + "step": 23786 + }, + { + "epoch": 2.612233692071162, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6911048889160156, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7209211587905884, + "num_tokens": 615344774.0, + "step": 23787 + }, + { + "epoch": 2.6123435097737753, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.68245792388916, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7278932332992554, + "num_tokens": 615365924.0, + "step": 23788 + }, + { + "epoch": 2.612453327476389, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.782472610473633, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7159783840179443, + "num_tokens": 615390288.0, + "step": 23789 + }, + { + "epoch": 2.612563145179003, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4995059967041016, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7405149936676025, + "num_tokens": 615413048.0, + "step": 23790 + }, + { + "epoch": 2.6126729628816165, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3821187019348145, + 
"learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7292237877845764, + "num_tokens": 615436740.0, + "step": 23791 + }, + { + "epoch": 2.6127827805842303, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.317786455154419, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7191873788833618, + "num_tokens": 615462034.0, + "step": 23792 + }, + { + "epoch": 2.6128925982868436, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4225103855133057, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7106610536575317, + "num_tokens": 615488700.0, + "step": 23793 + }, + { + "epoch": 2.6130024159894574, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5655744075775146, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7163043022155762, + "num_tokens": 615513606.0, + "step": 23794 + }, + { + "epoch": 2.613112233692071, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.607621192932129, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7128041982650757, + "num_tokens": 615535227.0, + "step": 23795 + }, + { + "epoch": 2.613222051394685, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.286059856414795, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7020763158798218, + "num_tokens": 615562831.0, + "step": 23796 + }, + { + "epoch": 2.6133318690972986, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2425127029418945, + "learning_rate": 1e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.6904373168945312, + "num_tokens": 615592648.0, + "step": 23797 + }, + { + "epoch": 2.613441686799912, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6723883152008057, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7163231372833252, + "num_tokens": 615614363.0, + "step": 23798 + }, + { + "epoch": 2.6135515045025257, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2059733867645264, + "learning_rate": 1e-06, + 
"loss": 1.0628, + "mean_token_accuracy": 0.6860494613647461, + "num_tokens": 615647265.0, + "step": 23799 + }, + { + "epoch": 2.6136613222051395, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5289134979248047, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7321508526802063, + "num_tokens": 615671285.0, + "step": 23800 + }, + { + "epoch": 2.6137711399077532, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.217224359512329, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6981621980667114, + "num_tokens": 615704263.0, + "step": 23801 + }, + { + "epoch": 2.613880957610367, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.482738971710205, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.714677631855011, + "num_tokens": 615728820.0, + "step": 23802 + }, + { + "epoch": 2.6139907753129803, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2275612354278564, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7003442049026489, + "num_tokens": 615758594.0, + "step": 23803 + }, + { + "epoch": 2.614100593015594, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2477667331695557, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6896355152130127, + "num_tokens": 615789841.0, + "step": 23804 + }, + { + "epoch": 2.614210410718208, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.639119863510132, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7127243876457214, + "num_tokens": 615811739.0, + "step": 23805 + }, + { + "epoch": 2.6143202284208216, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2031121253967285, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7069908976554871, + "num_tokens": 615843106.0, + "step": 23806 + }, + { + "epoch": 2.6144300461234353, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4361801147460938, + "learning_rate": 1e-06, + "loss": 0.9369, + 
"mean_token_accuracy": 0.7236922979354858, + "num_tokens": 615870852.0, + "step": 23807 + }, + { + "epoch": 2.6145398638260486, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.46628999710083, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7409796714782715, + "num_tokens": 615893427.0, + "step": 23808 + }, + { + "epoch": 2.6146496815286624, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3632378578186035, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6909902095794678, + "num_tokens": 615922853.0, + "step": 23809 + }, + { + "epoch": 2.614759499231276, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7145509719848633, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7112282514572144, + "num_tokens": 615944594.0, + "step": 23810 + }, + { + "epoch": 2.61486931693389, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6164164543151855, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7200295925140381, + "num_tokens": 615968025.0, + "step": 23811 + }, + { + "epoch": 2.6149791346365037, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4952683448791504, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6958389282226562, + "num_tokens": 615996177.0, + "step": 23812 + }, + { + "epoch": 2.615088952339117, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3667829036712646, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7122719287872314, + "num_tokens": 616023342.0, + "step": 23813 + }, + { + "epoch": 2.6151987700417307, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1516857147216797, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7098690271377563, + "num_tokens": 616052964.0, + "step": 23814 + }, + { + "epoch": 2.6153085877443445, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.43810772895813, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 
0.7049090266227722, + "num_tokens": 616078639.0, + "step": 23815 + }, + { + "epoch": 2.615418405446958, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 4.351491451263428, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7212101221084595, + "num_tokens": 616108430.0, + "step": 23816 + }, + { + "epoch": 2.6155282231495716, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.443791389465332, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.727869987487793, + "num_tokens": 616132651.0, + "step": 23817 + }, + { + "epoch": 2.6156380408521853, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3733069896698, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6959063410758972, + "num_tokens": 616160477.0, + "step": 23818 + }, + { + "epoch": 2.615747858554799, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7286946773529053, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7182112336158752, + "num_tokens": 616181366.0, + "step": 23819 + }, + { + "epoch": 2.615857676257413, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7060556411743164, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.708141565322876, + "num_tokens": 616203339.0, + "step": 23820 + }, + { + "epoch": 2.615967493960026, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4773900508880615, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7275390028953552, + "num_tokens": 616226918.0, + "step": 23821 + }, + { + "epoch": 2.61607731166264, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5058798789978027, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6948331594467163, + "num_tokens": 616253500.0, + "step": 23822 + }, + { + "epoch": 2.6161871293652537, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6171679496765137, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7346842288970947, + "num_tokens": 
616275443.0, + "step": 23823 + }, + { + "epoch": 2.6162969470678674, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5471508502960205, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7163482904434204, + "num_tokens": 616298483.0, + "step": 23824 + }, + { + "epoch": 2.616406764770481, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6314074993133545, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7120364904403687, + "num_tokens": 616321924.0, + "step": 23825 + }, + { + "epoch": 2.6165165824730945, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3805863857269287, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7110414505004883, + "num_tokens": 616349063.0, + "step": 23826 + }, + { + "epoch": 2.6166264001757082, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3070850372314453, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7075318098068237, + "num_tokens": 616375379.0, + "step": 23827 + }, + { + "epoch": 2.616736217878322, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5007381439208984, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7079681158065796, + "num_tokens": 616401438.0, + "step": 23828 + }, + { + "epoch": 2.6168460355809358, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2921504974365234, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7138504981994629, + "num_tokens": 616430323.0, + "step": 23829 + }, + { + "epoch": 2.6169558532835495, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.337123155593872, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7282731533050537, + "num_tokens": 616457365.0, + "step": 23830 + }, + { + "epoch": 2.617065670986163, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3657288551330566, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7216522097587585, + "num_tokens": 616484086.0, + "step": 
23831 + }, + { + "epoch": 2.6171754886887766, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1783370971679688, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6993823051452637, + "num_tokens": 616515260.0, + "step": 23832 + }, + { + "epoch": 2.6172853063913903, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3108887672424316, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7070838809013367, + "num_tokens": 616545524.0, + "step": 23833 + }, + { + "epoch": 2.617395124094004, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4469683170318604, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7207199335098267, + "num_tokens": 616570629.0, + "step": 23834 + }, + { + "epoch": 2.617504941796618, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2035276889801025, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7132807970046997, + "num_tokens": 616600138.0, + "step": 23835 + }, + { + "epoch": 2.617614759499231, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8035342693328857, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6977049708366394, + "num_tokens": 616621489.0, + "step": 23836 + }, + { + "epoch": 2.617724577201845, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8465371131896973, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7234240770339966, + "num_tokens": 616642458.0, + "step": 23837 + }, + { + "epoch": 2.6178343949044587, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.455639600753784, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7254385352134705, + "num_tokens": 616668748.0, + "step": 23838 + }, + { + "epoch": 2.617944212607072, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.156801700592041, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7148846387863159, + "num_tokens": 616700767.0, + "step": 23839 + }, + { + "epoch": 
2.618054030309686, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5053813457489014, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7315697073936462, + "num_tokens": 616725784.0, + "step": 23840 + }, + { + "epoch": 2.6181638480122995, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5654449462890625, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7323430776596069, + "num_tokens": 616749963.0, + "step": 23841 + }, + { + "epoch": 2.6182736657149133, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5453197956085205, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7163437604904175, + "num_tokens": 616773116.0, + "step": 23842 + }, + { + "epoch": 2.618383483417527, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4030096530914307, + "learning_rate": 1e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6850475668907166, + "num_tokens": 616799662.0, + "step": 23843 + }, + { + "epoch": 2.6184933011201403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2052013874053955, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7092260122299194, + "num_tokens": 616832181.0, + "step": 23844 + }, + { + "epoch": 2.618603118822754, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.551809310913086, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.749586284160614, + "num_tokens": 616856032.0, + "step": 23845 + }, + { + "epoch": 2.618712936525368, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.454806089401245, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.7055760622024536, + "num_tokens": 616882922.0, + "step": 23846 + }, + { + "epoch": 2.6188227542279816, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6363959312438965, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7194210886955261, + "num_tokens": 616905003.0, + "step": 23847 + }, + { + "epoch": 2.6189325719305954, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.374697208404541, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7124072909355164, + "num_tokens": 616934182.0, + "step": 23848 + }, + { + "epoch": 2.6190423896332087, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.012555122375488, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.695976972579956, + "num_tokens": 616962792.0, + "step": 23849 + }, + { + "epoch": 2.6191522073358224, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3065223693847656, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7060120105743408, + "num_tokens": 616993715.0, + "step": 23850 + }, + { + "epoch": 2.619262025038436, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7535712718963623, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7219470739364624, + "num_tokens": 617016131.0, + "step": 23851 + }, + { + "epoch": 2.61937184274105, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.195173501968384, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7011352777481079, + "num_tokens": 617047681.0, + "step": 23852 + }, + { + "epoch": 2.6194816604436637, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.319955825805664, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7325834035873413, + "num_tokens": 617074011.0, + "step": 23853 + }, + { + "epoch": 2.619591478146277, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.412933349609375, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7251264452934265, + "num_tokens": 617098524.0, + "step": 23854 + }, + { + "epoch": 2.6197012958488908, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.271681547164917, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7085124850273132, + "num_tokens": 617125190.0, + "step": 23855 + }, + { + "epoch": 2.6198111135515045, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.517564058303833, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.724132239818573, + "num_tokens": 617149480.0, + "step": 23856 + }, + { + "epoch": 2.6199209312541183, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4322140216827393, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7126482129096985, + "num_tokens": 617175751.0, + "step": 23857 + }, + { + "epoch": 2.620030748956732, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2328991889953613, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.713392436504364, + "num_tokens": 617207187.0, + "step": 23858 + }, + { + "epoch": 2.6201405666593454, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5040433406829834, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7208132743835449, + "num_tokens": 617230627.0, + "step": 23859 + }, + { + "epoch": 2.620250384361959, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5125207901000977, + "learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.681879460811615, + "num_tokens": 617255474.0, + "step": 23860 + }, + { + "epoch": 2.620360202064573, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.302626609802246, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7179149389266968, + "num_tokens": 617283950.0, + "step": 23861 + }, + { + "epoch": 2.6204700197671866, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.28428053855896, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7083327174186707, + "num_tokens": 617311662.0, + "step": 23862 + }, + { + "epoch": 2.6205798374698004, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4537267684936523, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.729084849357605, + "num_tokens": 617337524.0, + "step": 23863 + }, + { + "epoch": 2.6206896551724137, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.628816604614258, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.716099739074707, + "num_tokens": 617362717.0, + "step": 23864 + }, + { + "epoch": 2.6207994728750275, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3401660919189453, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7100081443786621, + "num_tokens": 617390882.0, + "step": 23865 + }, + { + "epoch": 2.620909290577641, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.352781295776367, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6979215741157532, + "num_tokens": 617419244.0, + "step": 23866 + }, + { + "epoch": 2.6210191082802545, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6331169605255127, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7242242097854614, + "num_tokens": 617440822.0, + "step": 23867 + }, + { + "epoch": 2.6211289259828683, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2545692920684814, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7061645984649658, + "num_tokens": 617472444.0, + "step": 23868 + }, + { + "epoch": 2.621238743685482, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.432284116744995, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.724185049533844, + "num_tokens": 617498360.0, + "step": 23869 + }, + { + "epoch": 2.621348561388096, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4745500087738037, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.716239333152771, + "num_tokens": 617524776.0, + "step": 23870 + }, + { + "epoch": 2.6214583790907096, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1554813385009766, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6991218328475952, + "num_tokens": 617559133.0, + "step": 23871 + }, + { + "epoch": 2.621568196793323, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.158064365386963, + 
"learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7052139043807983, + "num_tokens": 617591498.0, + "step": 23872 + }, + { + "epoch": 2.6216780144959366, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.265739917755127, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7129380702972412, + "num_tokens": 617622204.0, + "step": 23873 + }, + { + "epoch": 2.6217878321985504, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7282702922821045, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7045317888259888, + "num_tokens": 617645255.0, + "step": 23874 + }, + { + "epoch": 2.621897649901164, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4898273944854736, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7280244827270508, + "num_tokens": 617668651.0, + "step": 23875 + }, + { + "epoch": 2.622007467603778, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5545406341552734, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6893541812896729, + "num_tokens": 617693808.0, + "step": 23876 + }, + { + "epoch": 2.622117285306391, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6762564182281494, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7377755641937256, + "num_tokens": 617714789.0, + "step": 23877 + }, + { + "epoch": 2.622227103009005, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4930477142333984, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7124274969100952, + "num_tokens": 617740029.0, + "step": 23878 + }, + { + "epoch": 2.6223369207116187, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.571687698364258, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7247328162193298, + "num_tokens": 617763305.0, + "step": 23879 + }, + { + "epoch": 2.6224467384142325, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4226107597351074, + "learning_rate": 1e-06, + 
"loss": 0.8855, + "mean_token_accuracy": 0.7374952435493469, + "num_tokens": 617788975.0, + "step": 23880 + }, + { + "epoch": 2.6225565561168462, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.246366500854492, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7121454477310181, + "num_tokens": 617821155.0, + "step": 23881 + }, + { + "epoch": 2.6226663738194596, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.665800094604492, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7351862192153931, + "num_tokens": 617843279.0, + "step": 23882 + }, + { + "epoch": 2.6227761915220733, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3306078910827637, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7039986848831177, + "num_tokens": 617871053.0, + "step": 23883 + }, + { + "epoch": 2.622886009224687, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.142376184463501, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7235498428344727, + "num_tokens": 617901241.0, + "step": 23884 + }, + { + "epoch": 2.622995826927301, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6298828125, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6944453716278076, + "num_tokens": 617925776.0, + "step": 23885 + }, + { + "epoch": 2.6231056446299146, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7043166160583496, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7514424324035645, + "num_tokens": 617947516.0, + "step": 23886 + }, + { + "epoch": 2.623215462332528, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.0993459224700928, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7251717448234558, + "num_tokens": 617966419.0, + "step": 23887 + }, + { + "epoch": 2.6233252800351416, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.642287254333496, + "learning_rate": 1e-06, + "loss": 0.97, + 
"mean_token_accuracy": 0.7174117565155029, + "num_tokens": 617991109.0, + "step": 23888 + }, + { + "epoch": 2.6234350977377554, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3187625408172607, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.704197347164154, + "num_tokens": 618020680.0, + "step": 23889 + }, + { + "epoch": 2.6235449154403687, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.526973247528076, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7334426641464233, + "num_tokens": 618043607.0, + "step": 23890 + }, + { + "epoch": 2.623654733142983, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3081657886505127, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7093049883842468, + "num_tokens": 618072111.0, + "step": 23891 + }, + { + "epoch": 2.6237645508455962, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.473874807357788, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7135829925537109, + "num_tokens": 618095676.0, + "step": 23892 + }, + { + "epoch": 2.62387436854821, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2953743934631348, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7099584341049194, + "num_tokens": 618122311.0, + "step": 23893 + }, + { + "epoch": 2.6239841862508237, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5564138889312744, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7210152745246887, + "num_tokens": 618146648.0, + "step": 23894 + }, + { + "epoch": 2.624094003953437, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6863412857055664, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7356185913085938, + "num_tokens": 618167716.0, + "step": 23895 + }, + { + "epoch": 2.624203821656051, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3617119789123535, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 
0.7175109386444092, + "num_tokens": 618194484.0, + "step": 23896 + }, + { + "epoch": 2.6243136393586646, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6432437896728516, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7221726775169373, + "num_tokens": 618215614.0, + "step": 23897 + }, + { + "epoch": 2.6244234570612783, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5308499336242676, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.7071932554244995, + "num_tokens": 618241722.0, + "step": 23898 + }, + { + "epoch": 2.624533274763892, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3415920734405518, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7082639932632446, + "num_tokens": 618270986.0, + "step": 23899 + }, + { + "epoch": 2.6246430924665054, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3883895874023438, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.716060221195221, + "num_tokens": 618298025.0, + "step": 23900 + }, + { + "epoch": 2.624752910169119, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7255918979644775, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7135674357414246, + "num_tokens": 618320589.0, + "step": 23901 + }, + { + "epoch": 2.624862727871733, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6107969284057617, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7423563003540039, + "num_tokens": 618343548.0, + "step": 23902 + }, + { + "epoch": 2.6249725455743467, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5198888778686523, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7389320135116577, + "num_tokens": 618366871.0, + "step": 23903 + }, + { + "epoch": 2.6250823632769604, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2767863273620605, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7144930362701416, + 
"num_tokens": 618397442.0, + "step": 23904 + }, + { + "epoch": 2.6251921809795737, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.265244483947754, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7115679979324341, + "num_tokens": 618425474.0, + "step": 23905 + }, + { + "epoch": 2.6253019986821875, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.287875175476074, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7441512942314148, + "num_tokens": 618451059.0, + "step": 23906 + }, + { + "epoch": 2.6254118163848013, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.282470464706421, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.7008113861083984, + "num_tokens": 618481342.0, + "step": 23907 + }, + { + "epoch": 2.625521634087415, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2157952785491943, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7178784012794495, + "num_tokens": 618512365.0, + "step": 23908 + }, + { + "epoch": 2.6256314517900288, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3958559036254883, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7187073230743408, + "num_tokens": 618540595.0, + "step": 23909 + }, + { + "epoch": 2.625741269492642, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.667707681655884, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7150704264640808, + "num_tokens": 618562278.0, + "step": 23910 + }, + { + "epoch": 2.625851087195256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4404714107513428, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7228824496269226, + "num_tokens": 618587828.0, + "step": 23911 + }, + { + "epoch": 2.6259609048978696, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.308847665786743, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.699593186378479, + "num_tokens": 618615196.0, + 
"step": 23912 + }, + { + "epoch": 2.6260707226004834, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2027816772460938, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7352621555328369, + "num_tokens": 618643927.0, + "step": 23913 + }, + { + "epoch": 2.626180540303097, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1804652214050293, + "learning_rate": 1e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.6962579488754272, + "num_tokens": 618676520.0, + "step": 23914 + }, + { + "epoch": 2.6262903580057104, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4800403118133545, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7096858620643616, + "num_tokens": 618701042.0, + "step": 23915 + }, + { + "epoch": 2.626400175708324, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4468536376953125, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7235995531082153, + "num_tokens": 618727378.0, + "step": 23916 + }, + { + "epoch": 2.626509993410938, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.328308582305908, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7087498903274536, + "num_tokens": 618754896.0, + "step": 23917 + }, + { + "epoch": 2.6266198111135513, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4149248600006104, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7230426073074341, + "num_tokens": 618778671.0, + "step": 23918 + }, + { + "epoch": 2.626729628816165, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.394606590270996, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.7074377536773682, + "num_tokens": 618804593.0, + "step": 23919 + }, + { + "epoch": 2.6268394465187788, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3865153789520264, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7164625525474548, + "num_tokens": 618829558.0, + "step": 23920 + }, + { + "epoch": 
2.6269492642213925, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.363797664642334, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7130531072616577, + "num_tokens": 618856096.0, + "step": 23921 + }, + { + "epoch": 2.6270590819240063, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2674007415771484, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.708269476890564, + "num_tokens": 618887366.0, + "step": 23922 + }, + { + "epoch": 2.6271688996266196, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.180724620819092, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7069072723388672, + "num_tokens": 618918351.0, + "step": 23923 + }, + { + "epoch": 2.6272787173292333, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5579142570495605, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7081925868988037, + "num_tokens": 618944233.0, + "step": 23924 + }, + { + "epoch": 2.627388535031847, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5882315635681152, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7172004580497742, + "num_tokens": 618968428.0, + "step": 23925 + }, + { + "epoch": 2.627498352734461, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8294174671173096, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7294818162918091, + "num_tokens": 618989435.0, + "step": 23926 + }, + { + "epoch": 2.6276081704370746, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5795111656188965, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7075353860855103, + "num_tokens": 619011294.0, + "step": 23927 + }, + { + "epoch": 2.627717988139688, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7468600273132324, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7237904667854309, + "num_tokens": 619031707.0, + "step": 23928 + }, + { + "epoch": 2.6278278058423017, + 
"ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4408438205718994, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7286620140075684, + "num_tokens": 619056051.0, + "step": 23929 + }, + { + "epoch": 2.6279376235449154, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.534106969833374, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7119144201278687, + "num_tokens": 619078179.0, + "step": 23930 + }, + { + "epoch": 2.628047441247529, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2532150745391846, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7149457931518555, + "num_tokens": 619107033.0, + "step": 23931 + }, + { + "epoch": 2.628157258950143, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.511723518371582, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.7062186598777771, + "num_tokens": 619134208.0, + "step": 23932 + }, + { + "epoch": 2.6282670766527563, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.400747060775757, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7107617259025574, + "num_tokens": 619160739.0, + "step": 23933 + }, + { + "epoch": 2.62837689435537, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6036224365234375, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7125685811042786, + "num_tokens": 619182600.0, + "step": 23934 + }, + { + "epoch": 2.628486712057984, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5303592681884766, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7038596868515015, + "num_tokens": 619207172.0, + "step": 23935 + }, + { + "epoch": 2.6285965297605975, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.437761068344116, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7234736680984497, + "num_tokens": 619234152.0, + "step": 23936 + }, + { + "epoch": 2.6287063474632113, + "ewc_loss": 2.2292137145996094e-05, 
+ "grad_norm": 2.2180583477020264, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7108813524246216, + "num_tokens": 619264240.0, + "step": 23937 + }, + { + "epoch": 2.6288161651658246, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6777503490448, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.710360050201416, + "num_tokens": 619292047.0, + "step": 23938 + }, + { + "epoch": 2.6289259828684384, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5568621158599854, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7268712520599365, + "num_tokens": 619315084.0, + "step": 23939 + }, + { + "epoch": 2.629035800571052, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.9147918224334717, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7111026048660278, + "num_tokens": 619335129.0, + "step": 23940 + }, + { + "epoch": 2.6291456182736654, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3827929496765137, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7095405459403992, + "num_tokens": 619362365.0, + "step": 23941 + }, + { + "epoch": 2.6292554359762796, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5197980403900146, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7089422345161438, + "num_tokens": 619385774.0, + "step": 23942 + }, + { + "epoch": 2.629365253678893, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4347870349884033, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7474665641784668, + "num_tokens": 619411958.0, + "step": 23943 + }, + { + "epoch": 2.6294750713815067, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.160592794418335, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6955408453941345, + "num_tokens": 619446957.0, + "step": 23944 + }, + { + "epoch": 2.6295848890841205, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 
2.4195444583892822, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7168483734130859, + "num_tokens": 619477566.0, + "step": 23945 + }, + { + "epoch": 2.629694706786734, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.937992572784424, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7233278751373291, + "num_tokens": 619501131.0, + "step": 23946 + }, + { + "epoch": 2.6298045244893475, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.111844301223755, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7113713026046753, + "num_tokens": 619535315.0, + "step": 23947 + }, + { + "epoch": 2.6299143421919613, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.697683572769165, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7185319662094116, + "num_tokens": 619556423.0, + "step": 23948 + }, + { + "epoch": 2.630024159894575, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.179964780807495, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6959011554718018, + "num_tokens": 619587941.0, + "step": 23949 + }, + { + "epoch": 2.630133977597189, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5427756309509277, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7195670008659363, + "num_tokens": 619612379.0, + "step": 23950 + }, + { + "epoch": 2.630243795299802, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5913565158843994, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7268238067626953, + "num_tokens": 619634800.0, + "step": 23951 + }, + { + "epoch": 2.630353613002416, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4904654026031494, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7162038087844849, + "num_tokens": 619661653.0, + "step": 23952 + }, + { + "epoch": 2.6304634307050296, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.393970489501953, + 
"learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7196987867355347, + "num_tokens": 619688426.0, + "step": 23953 + }, + { + "epoch": 2.6305732484076434, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.64574933052063, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7143744826316833, + "num_tokens": 619711954.0, + "step": 23954 + }, + { + "epoch": 2.630683066110257, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.358393907546997, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.7035588622093201, + "num_tokens": 619740104.0, + "step": 23955 + }, + { + "epoch": 2.6307928838128705, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.56129789352417, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.7107488512992859, + "num_tokens": 619763446.0, + "step": 23956 + }, + { + "epoch": 2.630902701515484, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4784417152404785, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6964621543884277, + "num_tokens": 619789381.0, + "step": 23957 + }, + { + "epoch": 2.631012519218098, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.512699604034424, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7181786894798279, + "num_tokens": 619815071.0, + "step": 23958 + }, + { + "epoch": 2.6311223369207117, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5866858959198, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7289919853210449, + "num_tokens": 619836300.0, + "step": 23959 + }, + { + "epoch": 2.6312321546233255, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.297619342803955, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.7020496129989624, + "num_tokens": 619864827.0, + "step": 23960 + }, + { + "epoch": 2.631341972325939, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.668506383895874, + "learning_rate": 1e-06, + "loss": 
0.9056, + "mean_token_accuracy": 0.7292343378067017, + "num_tokens": 619886514.0, + "step": 23961 + }, + { + "epoch": 2.6314517900285526, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4496519565582275, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7166119813919067, + "num_tokens": 619911601.0, + "step": 23962 + }, + { + "epoch": 2.6315616077311663, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.776193618774414, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7206626534461975, + "num_tokens": 619933542.0, + "step": 23963 + }, + { + "epoch": 2.63167142543378, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.304722309112549, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7176058888435364, + "num_tokens": 619962953.0, + "step": 23964 + }, + { + "epoch": 2.631781243136394, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.8015549182891846, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7133257389068604, + "num_tokens": 619990685.0, + "step": 23965 + }, + { + "epoch": 2.631891060839007, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4936718940734863, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7112200260162354, + "num_tokens": 620016539.0, + "step": 23966 + }, + { + "epoch": 2.632000878541621, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1770217418670654, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7030060291290283, + "num_tokens": 620048256.0, + "step": 23967 + }, + { + "epoch": 2.6321106962442347, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.783719301223755, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7190837264060974, + "num_tokens": 620071741.0, + "step": 23968 + }, + { + "epoch": 2.632220513946848, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.498870611190796, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 
0.7096536159515381, + "num_tokens": 620096596.0, + "step": 23969 + }, + { + "epoch": 2.6323303316494617, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.669477701187134, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7138694524765015, + "num_tokens": 620118757.0, + "step": 23970 + }, + { + "epoch": 2.6324401493520755, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2762787342071533, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.683200478553772, + "num_tokens": 620152284.0, + "step": 23971 + }, + { + "epoch": 2.6325499670546892, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6825544834136963, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7093784809112549, + "num_tokens": 620174262.0, + "step": 23972 + }, + { + "epoch": 2.632659784757303, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2145349979400635, + "learning_rate": 1e-06, + "loss": 1.1146, + "mean_token_accuracy": 0.6737253665924072, + "num_tokens": 620205473.0, + "step": 23973 + }, + { + "epoch": 2.6327696024599163, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4706666469573975, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6984050869941711, + "num_tokens": 620231736.0, + "step": 23974 + }, + { + "epoch": 2.63287942016253, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3213632106781006, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7345166206359863, + "num_tokens": 620260991.0, + "step": 23975 + }, + { + "epoch": 2.632989237865144, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2465665340423584, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7173902988433838, + "num_tokens": 620288854.0, + "step": 23976 + }, + { + "epoch": 2.6330990555677576, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.328573703765869, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6919419765472412, + 
"num_tokens": 620317596.0, + "step": 23977 + }, + { + "epoch": 2.6332088732703713, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.489048957824707, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7375921010971069, + "num_tokens": 620341641.0, + "step": 23978 + }, + { + "epoch": 2.6333186909729847, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2741315364837646, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7144383192062378, + "num_tokens": 620371222.0, + "step": 23979 + }, + { + "epoch": 2.6334285086755984, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5727407932281494, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7263224720954895, + "num_tokens": 620395234.0, + "step": 23980 + }, + { + "epoch": 2.633538326378212, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2829551696777344, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7169637084007263, + "num_tokens": 620423716.0, + "step": 23981 + }, + { + "epoch": 2.633648144080826, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7455244064331055, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7150458097457886, + "num_tokens": 620445412.0, + "step": 23982 + }, + { + "epoch": 2.6337579617834397, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5992441177368164, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.734250009059906, + "num_tokens": 620468398.0, + "step": 23983 + }, + { + "epoch": 2.633867779486053, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4476897716522217, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7144247889518738, + "num_tokens": 620492287.0, + "step": 23984 + }, + { + "epoch": 2.6339775971886668, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.641735076904297, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7221092581748962, + "num_tokens": 620514595.0, + 
"step": 23985 + }, + { + "epoch": 2.6340874148912805, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.368781566619873, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7158013582229614, + "num_tokens": 620543280.0, + "step": 23986 + }, + { + "epoch": 2.6341972325938943, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6080515384674072, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7122832536697388, + "num_tokens": 620564893.0, + "step": 23987 + }, + { + "epoch": 2.634307050296508, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4950315952301025, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.7010577321052551, + "num_tokens": 620592556.0, + "step": 23988 + }, + { + "epoch": 2.6344168679991213, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.289406776428223, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7363995313644409, + "num_tokens": 620612564.0, + "step": 23989 + }, + { + "epoch": 2.634526685701735, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4195492267608643, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7296978235244751, + "num_tokens": 620637624.0, + "step": 23990 + }, + { + "epoch": 2.634636503404349, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4835808277130127, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7085897922515869, + "num_tokens": 620662855.0, + "step": 23991 + }, + { + "epoch": 2.6347463211069626, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5860118865966797, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.736340343952179, + "num_tokens": 620684595.0, + "step": 23992 + }, + { + "epoch": 2.6348561388095764, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.563636302947998, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7138156294822693, + "num_tokens": 620710732.0, + "step": 23993 + }, + { + 
"epoch": 2.6349659565121897, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2517433166503906, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6968193650245667, + "num_tokens": 620740712.0, + "step": 23994 + }, + { + "epoch": 2.6350757742148034, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6128180027008057, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.720313310623169, + "num_tokens": 620764298.0, + "step": 23995 + }, + { + "epoch": 2.635185591917417, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3774187564849854, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7052216529846191, + "num_tokens": 620790394.0, + "step": 23996 + }, + { + "epoch": 2.6352954096200305, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6959569454193115, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6969650983810425, + "num_tokens": 620817893.0, + "step": 23997 + }, + { + "epoch": 2.6354052273226443, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2138636112213135, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7293483018875122, + "num_tokens": 620844533.0, + "step": 23998 + }, + { + "epoch": 2.635515045025258, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.745300531387329, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.7075046300888062, + "num_tokens": 620867060.0, + "step": 23999 + }, + { + "epoch": 2.6356248627278718, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.430137872695923, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7150198817253113, + "num_tokens": 620893365.0, + "step": 24000 + }, + { + "epoch": 2.6357346804304855, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3488717079162598, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7284285426139832, + "num_tokens": 620920208.0, + "step": 24001 + }, + { + "epoch": 2.635844498133099, 
+ "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.620171070098877, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7064005732536316, + "num_tokens": 620942974.0, + "step": 24002 + }, + { + "epoch": 2.6359543158357126, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4036567211151123, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6991658210754395, + "num_tokens": 620970769.0, + "step": 24003 + }, + { + "epoch": 2.6360641335383264, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1966118812561035, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6895145773887634, + "num_tokens": 621003008.0, + "step": 24004 + }, + { + "epoch": 2.63617395124094, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.296196460723877, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7092284560203552, + "num_tokens": 621033307.0, + "step": 24005 + }, + { + "epoch": 2.636283768943554, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.249940872192383, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.712375283241272, + "num_tokens": 621063311.0, + "step": 24006 + }, + { + "epoch": 2.636393586646167, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.374905586242676, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7097514867782593, + "num_tokens": 621091081.0, + "step": 24007 + }, + { + "epoch": 2.636503404348781, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.828317403793335, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6982696056365967, + "num_tokens": 621111113.0, + "step": 24008 + }, + { + "epoch": 2.6366132220513947, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.482103109359741, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7114250063896179, + "num_tokens": 621137657.0, + "step": 24009 + }, + { + "epoch": 2.6367230397540085, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.691962957382202, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7334557771682739, + "num_tokens": 621157945.0, + "step": 24010 + }, + { + "epoch": 2.636832857456622, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.698503017425537, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6981804370880127, + "num_tokens": 621178900.0, + "step": 24011 + }, + { + "epoch": 2.6369426751592355, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3770558834075928, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7168266177177429, + "num_tokens": 621204561.0, + "step": 24012 + }, + { + "epoch": 2.6370524928618493, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6359245777130127, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7145509719848633, + "num_tokens": 621226032.0, + "step": 24013 + }, + { + "epoch": 2.637162310564463, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6960561275482178, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7088869214057922, + "num_tokens": 621245472.0, + "step": 24014 + }, + { + "epoch": 2.637272128267077, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6064605712890625, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7069881558418274, + "num_tokens": 621266922.0, + "step": 24015 + }, + { + "epoch": 2.6373819459696906, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.331777811050415, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.7063888907432556, + "num_tokens": 621296209.0, + "step": 24016 + }, + { + "epoch": 2.637491763672304, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6887896060943604, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.7048657536506653, + "num_tokens": 621318693.0, + "step": 24017 + }, + { + "epoch": 2.6376015813749176, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.340567111968994, + "learning_rate": 1e-06, + "loss": 1.0823, + "mean_token_accuracy": 0.6833178400993347, + "num_tokens": 621347752.0, + "step": 24018 + }, + { + "epoch": 2.6377113990775314, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.375765085220337, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7432854771614075, + "num_tokens": 621373631.0, + "step": 24019 + }, + { + "epoch": 2.6378212167801447, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4766368865966797, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7266759276390076, + "num_tokens": 621398262.0, + "step": 24020 + }, + { + "epoch": 2.637931034482759, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3874313831329346, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7398955225944519, + "num_tokens": 621423360.0, + "step": 24021 + }, + { + "epoch": 2.638040852185372, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.831282138824463, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.718454897403717, + "num_tokens": 621451716.0, + "step": 24022 + }, + { + "epoch": 2.638150669887986, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5293521881103516, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7039434909820557, + "num_tokens": 621475198.0, + "step": 24023 + }, + { + "epoch": 2.6382604875905997, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5505926609039307, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7109244465827942, + "num_tokens": 621499649.0, + "step": 24024 + }, + { + "epoch": 2.638370305293213, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3706703186035156, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7255228757858276, + "num_tokens": 621524844.0, + "step": 24025 + }, + { + "epoch": 2.638480122995827, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1778604984283447, 
+ "learning_rate": 1e-06, + "loss": 1.0712, + "mean_token_accuracy": 0.6885196566581726, + "num_tokens": 621556794.0, + "step": 24026 + }, + { + "epoch": 2.6385899406984406, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.705392360687256, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7103703022003174, + "num_tokens": 621576931.0, + "step": 24027 + }, + { + "epoch": 2.6386997584010543, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2968485355377197, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7063892483711243, + "num_tokens": 621603944.0, + "step": 24028 + }, + { + "epoch": 2.638809576103668, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.242077589035034, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6979967355728149, + "num_tokens": 621635189.0, + "step": 24029 + }, + { + "epoch": 2.6389193938062814, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8007328510284424, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7326892614364624, + "num_tokens": 621656516.0, + "step": 24030 + }, + { + "epoch": 2.639029211508895, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.513913869857788, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7097398638725281, + "num_tokens": 621681421.0, + "step": 24031 + }, + { + "epoch": 2.639139029211509, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.495631217956543, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6839978098869324, + "num_tokens": 621705867.0, + "step": 24032 + }, + { + "epoch": 2.6392488469141226, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.36391282081604, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7179219722747803, + "num_tokens": 621731823.0, + "step": 24033 + }, + { + "epoch": 2.6393586646167364, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8039562702178955, + "learning_rate": 1e-06, + 
"loss": 0.8706, + "mean_token_accuracy": 0.7345020771026611, + "num_tokens": 621751871.0, + "step": 24034 + }, + { + "epoch": 2.6394684823193497, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.286360740661621, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7163453102111816, + "num_tokens": 621778959.0, + "step": 24035 + }, + { + "epoch": 2.6395783000219635, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.418210029602051, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.7022808790206909, + "num_tokens": 621806512.0, + "step": 24036 + }, + { + "epoch": 2.6396881177245772, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.0189766883850098, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7253105044364929, + "num_tokens": 621823949.0, + "step": 24037 + }, + { + "epoch": 2.639797935427191, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2854738235473633, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7290002107620239, + "num_tokens": 621851867.0, + "step": 24038 + }, + { + "epoch": 2.6399077531298047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3711166381835938, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7188353538513184, + "num_tokens": 621877830.0, + "step": 24039 + }, + { + "epoch": 2.640017570832418, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.461104154586792, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6973795890808105, + "num_tokens": 621904075.0, + "step": 24040 + }, + { + "epoch": 2.640127388535032, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7816076278686523, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.730213463306427, + "num_tokens": 621923382.0, + "step": 24041 + }, + { + "epoch": 2.6402372062376456, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.305363893508911, + "learning_rate": 1e-06, + "loss": 0.9735, + 
"mean_token_accuracy": 0.7134993672370911, + "num_tokens": 621953238.0, + "step": 24042 + }, + { + "epoch": 2.6403470239402593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8581366539001465, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7266782522201538, + "num_tokens": 621972582.0, + "step": 24043 + }, + { + "epoch": 2.640456841642873, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4781532287597656, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7122094631195068, + "num_tokens": 621997752.0, + "step": 24044 + }, + { + "epoch": 2.6405666593454864, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.674696207046509, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7266130447387695, + "num_tokens": 622020119.0, + "step": 24045 + }, + { + "epoch": 2.6406764770481, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.618297815322876, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7202298641204834, + "num_tokens": 622043496.0, + "step": 24046 + }, + { + "epoch": 2.640786294750714, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4259610176086426, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7012642025947571, + "num_tokens": 622070300.0, + "step": 24047 + }, + { + "epoch": 2.6408961124533272, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.62038254737854, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7120072841644287, + "num_tokens": 622094411.0, + "step": 24048 + }, + { + "epoch": 2.641005930155941, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.280714273452759, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6917868256568909, + "num_tokens": 622128698.0, + "step": 24049 + }, + { + "epoch": 2.6411157478585547, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.766618013381958, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 
0.7138170599937439, + "num_tokens": 622148936.0, + "step": 24050 + }, + { + "epoch": 2.6412255655611685, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.287043809890747, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7197367548942566, + "num_tokens": 622176301.0, + "step": 24051 + }, + { + "epoch": 2.6413353832637823, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3879904747009277, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6963909864425659, + "num_tokens": 622208911.0, + "step": 24052 + }, + { + "epoch": 2.6414452009663956, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3948655128479004, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6987773776054382, + "num_tokens": 622236356.0, + "step": 24053 + }, + { + "epoch": 2.6415550186690093, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.288313150405884, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7109517455101013, + "num_tokens": 622262816.0, + "step": 24054 + }, + { + "epoch": 2.641664836371623, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6205554008483887, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7013223171234131, + "num_tokens": 622284963.0, + "step": 24055 + }, + { + "epoch": 2.641774654074237, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.91402530670166, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7377233505249023, + "num_tokens": 622305749.0, + "step": 24056 + }, + { + "epoch": 2.6418844717768506, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4467625617980957, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7330600023269653, + "num_tokens": 622331647.0, + "step": 24057 + }, + { + "epoch": 2.641994289479464, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.617828369140625, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.707241415977478, + 
"num_tokens": 622354461.0, + "step": 24058 + }, + { + "epoch": 2.6421041071820777, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4537181854248047, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7070875763893127, + "num_tokens": 622378606.0, + "step": 24059 + }, + { + "epoch": 2.6422139248846914, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.482307195663452, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7178792357444763, + "num_tokens": 622404972.0, + "step": 24060 + }, + { + "epoch": 2.642323742587305, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3301098346710205, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7005444765090942, + "num_tokens": 622437224.0, + "step": 24061 + }, + { + "epoch": 2.642433560289919, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1737101078033447, + "learning_rate": 1e-06, + "loss": 1.0908, + "mean_token_accuracy": 0.6778340339660645, + "num_tokens": 622470711.0, + "step": 24062 + }, + { + "epoch": 2.6425433779925322, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.9219765663146973, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7265359163284302, + "num_tokens": 622495174.0, + "step": 24063 + }, + { + "epoch": 2.642653195695146, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6305010318756104, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7335221767425537, + "num_tokens": 622518425.0, + "step": 24064 + }, + { + "epoch": 2.6427630133977598, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.156280517578125, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7067205309867859, + "num_tokens": 622549915.0, + "step": 24065 + }, + { + "epoch": 2.6428728311003735, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6781556606292725, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6908325552940369, + "num_tokens": 622572444.0, + 
"step": 24066 + }, + { + "epoch": 2.6429826488029873, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7241077423095703, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7336850166320801, + "num_tokens": 622593190.0, + "step": 24067 + }, + { + "epoch": 2.6430924665056006, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6397385597229004, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.712765097618103, + "num_tokens": 622620392.0, + "step": 24068 + }, + { + "epoch": 2.6432022842082143, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5491647720336914, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7107397317886353, + "num_tokens": 622649585.0, + "step": 24069 + }, + { + "epoch": 2.643312101910828, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.217383861541748, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7236230373382568, + "num_tokens": 622676984.0, + "step": 24070 + }, + { + "epoch": 2.6434219196134414, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2733047008514404, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7053970694541931, + "num_tokens": 622705084.0, + "step": 24071 + }, + { + "epoch": 2.6435317373160556, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4585554599761963, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7093126773834229, + "num_tokens": 622732563.0, + "step": 24072 + }, + { + "epoch": 2.643641555018669, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.591125726699829, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7300387620925903, + "num_tokens": 622757536.0, + "step": 24073 + }, + { + "epoch": 2.6437513727212827, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3507614135742188, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.710502028465271, + "num_tokens": 622787075.0, + "step": 24074 + }, + { + 
"epoch": 2.6438611904238964, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.603400945663452, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.707104504108429, + "num_tokens": 622811946.0, + "step": 24075 + }, + { + "epoch": 2.6439710081265098, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3378336429595947, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7186904549598694, + "num_tokens": 622839809.0, + "step": 24076 + }, + { + "epoch": 2.6440808258291235, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7708725929260254, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.728870689868927, + "num_tokens": 622858369.0, + "step": 24077 + }, + { + "epoch": 2.6441906435317373, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.1215217113494873, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7244457006454468, + "num_tokens": 622879353.0, + "step": 24078 + }, + { + "epoch": 2.644300461234351, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7016122341156006, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6975901126861572, + "num_tokens": 622906053.0, + "step": 24079 + }, + { + "epoch": 2.644410278936965, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3621747493743896, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7035788297653198, + "num_tokens": 622935393.0, + "step": 24080 + }, + { + "epoch": 2.644520096639578, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4525516033172607, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7059707641601562, + "num_tokens": 622961271.0, + "step": 24081 + }, + { + "epoch": 2.644629914342192, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.349987030029297, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7289577126502991, + "num_tokens": 622988400.0, + "step": 24082 + }, + { + "epoch": 2.6447397320448056, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.658881425857544, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7147396802902222, + "num_tokens": 623011022.0, + "step": 24083 + }, + { + "epoch": 2.6448495497474194, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7673895359039307, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7537751197814941, + "num_tokens": 623030796.0, + "step": 24084 + }, + { + "epoch": 2.644959367450033, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3085520267486572, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6991233825683594, + "num_tokens": 623060043.0, + "step": 24085 + }, + { + "epoch": 2.6450691851526464, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4339451789855957, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7322477102279663, + "num_tokens": 623082033.0, + "step": 24086 + }, + { + "epoch": 2.64517900285526, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3649203777313232, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7023380398750305, + "num_tokens": 623109445.0, + "step": 24087 + }, + { + "epoch": 2.645288820557874, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2992897033691406, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6989589333534241, + "num_tokens": 623138438.0, + "step": 24088 + }, + { + "epoch": 2.6453986382604877, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.444869041442871, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7253326177597046, + "num_tokens": 623162892.0, + "step": 24089 + }, + { + "epoch": 2.6455084559631015, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.096327304840088, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7246215343475342, + "num_tokens": 623181200.0, + "step": 24090 + }, + { + "epoch": 2.645618273665715, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.419644594192505, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7136040925979614, + "num_tokens": 623209572.0, + "step": 24091 + }, + { + "epoch": 2.6457280913683285, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.549506187438965, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7037785053253174, + "num_tokens": 623233462.0, + "step": 24092 + }, + { + "epoch": 2.6458379090709423, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2773420810699463, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7126538753509521, + "num_tokens": 623260152.0, + "step": 24093 + }, + { + "epoch": 2.645947726773556, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.287935495376587, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7067753672599792, + "num_tokens": 623290976.0, + "step": 24094 + }, + { + "epoch": 2.64605754447617, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.654179573059082, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6985262036323547, + "num_tokens": 623314911.0, + "step": 24095 + }, + { + "epoch": 2.646167362178783, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3569395542144775, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7191822528839111, + "num_tokens": 623342413.0, + "step": 24096 + }, + { + "epoch": 2.646277179881397, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.277812957763672, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6975163221359253, + "num_tokens": 623371308.0, + "step": 24097 + }, + { + "epoch": 2.6463869975840106, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2345528602600098, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7449864149093628, + "num_tokens": 623398913.0, + "step": 24098 + }, + { + "epoch": 2.646496815286624, + "ewc_loss": 2.2292137145996094e-05, + 
"grad_norm": 2.2390270233154297, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6947123408317566, + "num_tokens": 623430023.0, + "step": 24099 + }, + { + "epoch": 2.6466066329892377, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.154566764831543, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7025504112243652, + "num_tokens": 623461192.0, + "step": 24100 + }, + { + "epoch": 2.6467164506918515, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.469667911529541, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7264341115951538, + "num_tokens": 623487347.0, + "step": 24101 + }, + { + "epoch": 2.646826268394465, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4282379150390625, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7049973011016846, + "num_tokens": 623511378.0, + "step": 24102 + }, + { + "epoch": 2.646936086097079, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.466691255569458, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7068623304367065, + "num_tokens": 623535263.0, + "step": 24103 + }, + { + "epoch": 2.6470459037996923, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3789660930633545, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7001024484634399, + "num_tokens": 623562398.0, + "step": 24104 + }, + { + "epoch": 2.647155721502306, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.553410291671753, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7175900936126709, + "num_tokens": 623586222.0, + "step": 24105 + }, + { + "epoch": 2.64726553920492, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.916438579559326, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7081423997879028, + "num_tokens": 623608154.0, + "step": 24106 + }, + { + "epoch": 2.6473753569075336, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5797970294952393, 
+ "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6898699998855591, + "num_tokens": 623634219.0, + "step": 24107 + }, + { + "epoch": 2.6474851746101473, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6567795276641846, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7072296142578125, + "num_tokens": 623661239.0, + "step": 24108 + }, + { + "epoch": 2.6475949923127606, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.304900884628296, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7371573448181152, + "num_tokens": 623687698.0, + "step": 24109 + }, + { + "epoch": 2.6477048100153744, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.527595043182373, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7173590064048767, + "num_tokens": 623712239.0, + "step": 24110 + }, + { + "epoch": 2.647814627717988, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.627169370651245, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7307491302490234, + "num_tokens": 623734006.0, + "step": 24111 + }, + { + "epoch": 2.647924445420602, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.436185836791992, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.708972692489624, + "num_tokens": 623761280.0, + "step": 24112 + }, + { + "epoch": 2.6480342631232157, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.398944616317749, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7024505138397217, + "num_tokens": 623788872.0, + "step": 24113 + }, + { + "epoch": 2.648144080825829, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.416541337966919, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6914219856262207, + "num_tokens": 623815823.0, + "step": 24114 + }, + { + "epoch": 2.6482538985284427, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.393660545349121, + "learning_rate": 1e-06, + 
"loss": 0.9659, + "mean_token_accuracy": 0.7111443877220154, + "num_tokens": 623841142.0, + "step": 24115 + }, + { + "epoch": 2.6483637162310565, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3676211833953857, + "learning_rate": 1e-06, + "loss": 1.0891, + "mean_token_accuracy": 0.6799763441085815, + "num_tokens": 623868966.0, + "step": 24116 + }, + { + "epoch": 2.6484735339336702, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.645294189453125, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7197014093399048, + "num_tokens": 623889833.0, + "step": 24117 + }, + { + "epoch": 2.648583351636284, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.540214776992798, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6973624229431152, + "num_tokens": 623916392.0, + "step": 24118 + }, + { + "epoch": 2.6486931693388973, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.463839530944824, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7118009328842163, + "num_tokens": 623941110.0, + "step": 24119 + }, + { + "epoch": 2.648802987041511, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4155006408691406, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7208843231201172, + "num_tokens": 623965608.0, + "step": 24120 + }, + { + "epoch": 2.648912804744125, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4426398277282715, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7149471044540405, + "num_tokens": 623992636.0, + "step": 24121 + }, + { + "epoch": 2.649022622446738, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.426121711730957, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7280591130256653, + "num_tokens": 624016363.0, + "step": 24122 + }, + { + "epoch": 2.6491324401493523, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3514885902404785, + "learning_rate": 1e-06, + "loss": 0.9313, + 
"mean_token_accuracy": 0.7299504280090332, + "num_tokens": 624043771.0, + "step": 24123 + }, + { + "epoch": 2.6492422578519657, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.309363842010498, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7070077061653137, + "num_tokens": 624072386.0, + "step": 24124 + }, + { + "epoch": 2.6493520755545794, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.476912021636963, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7153619527816772, + "num_tokens": 624097076.0, + "step": 24125 + }, + { + "epoch": 2.649461893257193, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.483529806137085, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7313655614852905, + "num_tokens": 624120131.0, + "step": 24126 + }, + { + "epoch": 2.6495717109598065, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3630521297454834, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7248903512954712, + "num_tokens": 624148254.0, + "step": 24127 + }, + { + "epoch": 2.6496815286624202, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3503618240356445, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6995732188224792, + "num_tokens": 624176985.0, + "step": 24128 + }, + { + "epoch": 2.649791346365034, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.470188856124878, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7180699110031128, + "num_tokens": 624202640.0, + "step": 24129 + }, + { + "epoch": 2.6499011640676478, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.615185022354126, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7291151881217957, + "num_tokens": 624225318.0, + "step": 24130 + }, + { + "epoch": 2.6500109817702615, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.572641611099243, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 
0.7173812389373779, + "num_tokens": 624249215.0, + "step": 24131 + }, + { + "epoch": 2.650120799472875, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.625833749771118, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7186658382415771, + "num_tokens": 624271947.0, + "step": 24132 + }, + { + "epoch": 2.6502306171754886, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.50435471534729, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7229657769203186, + "num_tokens": 624294819.0, + "step": 24133 + }, + { + "epoch": 2.6503404348781023, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.635347366333008, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7151593565940857, + "num_tokens": 624319693.0, + "step": 24134 + }, + { + "epoch": 2.650450252580716, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2180240154266357, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7128742337226868, + "num_tokens": 624350989.0, + "step": 24135 + }, + { + "epoch": 2.65056007028333, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2580020427703857, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7401008605957031, + "num_tokens": 624380414.0, + "step": 24136 + }, + { + "epoch": 2.650669887985943, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3738110065460205, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7047778367996216, + "num_tokens": 624406399.0, + "step": 24137 + }, + { + "epoch": 2.650779705688557, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3320343494415283, + "learning_rate": 1e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6937402486801147, + "num_tokens": 624433901.0, + "step": 24138 + }, + { + "epoch": 2.6508895233911707, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4756276607513428, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7220910787582397, + 
"num_tokens": 624458660.0, + "step": 24139 + }, + { + "epoch": 2.6509993410937844, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6262075901031494, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7188479900360107, + "num_tokens": 624481456.0, + "step": 24140 + }, + { + "epoch": 2.651109158796398, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6113932132720947, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7140495777130127, + "num_tokens": 624504676.0, + "step": 24141 + }, + { + "epoch": 2.6512189764990115, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2887256145477295, + "learning_rate": 1e-06, + "loss": 1.1252, + "mean_token_accuracy": 0.6789637207984924, + "num_tokens": 624535033.0, + "step": 24142 + }, + { + "epoch": 2.6513287942016253, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6014468669891357, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.722926139831543, + "num_tokens": 624557535.0, + "step": 24143 + }, + { + "epoch": 2.651438611904239, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5849838256835938, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7133175730705261, + "num_tokens": 624583930.0, + "step": 24144 + }, + { + "epoch": 2.6515484296068528, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.606144905090332, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7157286405563354, + "num_tokens": 624607216.0, + "step": 24145 + }, + { + "epoch": 2.6516582473094665, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6879847049713135, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7291899919509888, + "num_tokens": 624629725.0, + "step": 24146 + }, + { + "epoch": 2.65176806501208, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4617385864257812, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7111009955406189, + "num_tokens": 624654995.0, + 
"step": 24147 + }, + { + "epoch": 2.6518778827146936, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6006112098693848, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7359124422073364, + "num_tokens": 624677595.0, + "step": 24148 + }, + { + "epoch": 2.6519877004173074, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.385572910308838, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7020039558410645, + "num_tokens": 624705954.0, + "step": 24149 + }, + { + "epoch": 2.6520975181199207, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3506479263305664, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6993285417556763, + "num_tokens": 624735000.0, + "step": 24150 + }, + { + "epoch": 2.6522073358225344, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7618160247802734, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7030797004699707, + "num_tokens": 624756511.0, + "step": 24151 + }, + { + "epoch": 2.652317153525148, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.317617177963257, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6948891282081604, + "num_tokens": 624784308.0, + "step": 24152 + }, + { + "epoch": 2.652426971227762, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.540095090866089, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7122629880905151, + "num_tokens": 624809534.0, + "step": 24153 + }, + { + "epoch": 2.6525367889303757, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3980278968811035, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7299850583076477, + "num_tokens": 624834936.0, + "step": 24154 + }, + { + "epoch": 2.652646606632989, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.775233030319214, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6990140676498413, + "num_tokens": 624864487.0, + "step": 24155 + }, + { + 
"epoch": 2.6527564243356028, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.538388729095459, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6969907879829407, + "num_tokens": 624889573.0, + "step": 24156 + }, + { + "epoch": 2.6528662420382165, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3897714614868164, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6927668452262878, + "num_tokens": 624918369.0, + "step": 24157 + }, + { + "epoch": 2.6529760597408303, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.419591188430786, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7218788862228394, + "num_tokens": 624944122.0, + "step": 24158 + }, + { + "epoch": 2.653085877443444, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6206629276275635, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7425605654716492, + "num_tokens": 624964762.0, + "step": 24159 + }, + { + "epoch": 2.6531956951460574, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.564244508743286, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.7022553086280823, + "num_tokens": 624989315.0, + "step": 24160 + }, + { + "epoch": 2.653305512848671, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.529013156890869, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6942436099052429, + "num_tokens": 625016020.0, + "step": 24161 + }, + { + "epoch": 2.653415330551285, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1410346031188965, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7166157960891724, + "num_tokens": 625046600.0, + "step": 24162 + }, + { + "epoch": 2.6535251482538986, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.409334659576416, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7109512090682983, + "num_tokens": 625073179.0, + "step": 24163 + }, + { + "epoch": 2.6536349659565124, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4791386127471924, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7282840609550476, + "num_tokens": 625100817.0, + "step": 24164 + }, + { + "epoch": 2.6537447836591257, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1910550594329834, + "learning_rate": 1e-06, + "loss": 1.072, + "mean_token_accuracy": 0.688900351524353, + "num_tokens": 625132091.0, + "step": 24165 + }, + { + "epoch": 2.6538546013617395, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.57768177986145, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7133982181549072, + "num_tokens": 625156293.0, + "step": 24166 + }, + { + "epoch": 2.653964419064353, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3552448749542236, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6920343637466431, + "num_tokens": 625185119.0, + "step": 24167 + }, + { + "epoch": 2.654074236766967, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.302572250366211, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7049164772033691, + "num_tokens": 625216115.0, + "step": 24168 + }, + { + "epoch": 2.6541840544695807, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4095280170440674, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7124995589256287, + "num_tokens": 625243558.0, + "step": 24169 + }, + { + "epoch": 2.654293872172194, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.594780921936035, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7211164236068726, + "num_tokens": 625266309.0, + "step": 24170 + }, + { + "epoch": 2.654403689874808, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2644622325897217, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.693966805934906, + "num_tokens": 625295082.0, + "step": 24171 + }, + { + "epoch": 2.6545135075774215, + "ewc_loss": 2.2411346435546875e-05, 
+ "grad_norm": 2.5291950702667236, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.735581636428833, + "num_tokens": 625319200.0, + "step": 24172 + }, + { + "epoch": 2.6546233252800353, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4337515830993652, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7394654750823975, + "num_tokens": 625344812.0, + "step": 24173 + }, + { + "epoch": 2.654733142982649, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3228437900543213, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7158931493759155, + "num_tokens": 625371736.0, + "step": 24174 + }, + { + "epoch": 2.6548429606852624, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3325843811035156, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7226386070251465, + "num_tokens": 625400843.0, + "step": 24175 + }, + { + "epoch": 2.654952778387876, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.775524377822876, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7057802081108093, + "num_tokens": 625421941.0, + "step": 24176 + }, + { + "epoch": 2.65506259609049, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.63459849357605, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.716804027557373, + "num_tokens": 625443964.0, + "step": 24177 + }, + { + "epoch": 2.655172413793103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.03671407699585, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.729323148727417, + "num_tokens": 625463927.0, + "step": 24178 + }, + { + "epoch": 2.655282231495717, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.350818395614624, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7224699854850769, + "num_tokens": 625492348.0, + "step": 24179 + }, + { + "epoch": 2.6553920491983307, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.875101089477539, + 
"learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7213540077209473, + "num_tokens": 625513604.0, + "step": 24180 + }, + { + "epoch": 2.6555018669009445, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.732424020767212, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7160736918449402, + "num_tokens": 625534927.0, + "step": 24181 + }, + { + "epoch": 2.6556116846035582, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5467147827148438, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7312614917755127, + "num_tokens": 625558477.0, + "step": 24182 + }, + { + "epoch": 2.6557215023061715, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2983665466308594, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7208619117736816, + "num_tokens": 625585865.0, + "step": 24183 + }, + { + "epoch": 2.6558313200087853, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417577028274536, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7104081511497498, + "num_tokens": 625610298.0, + "step": 24184 + }, + { + "epoch": 2.655941137711399, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4703726768493652, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7203532457351685, + "num_tokens": 625635271.0, + "step": 24185 + }, + { + "epoch": 2.656050955414013, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3095202445983887, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6857932806015015, + "num_tokens": 625667778.0, + "step": 24186 + }, + { + "epoch": 2.6561607731166266, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.498317241668701, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7045630216598511, + "num_tokens": 625693015.0, + "step": 24187 + }, + { + "epoch": 2.65627059081924, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2148349285125732, + "learning_rate": 1e-06, + 
"loss": 1.0311, + "mean_token_accuracy": 0.7028884887695312, + "num_tokens": 625726287.0, + "step": 24188 + }, + { + "epoch": 2.6563804085218536, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6672399044036865, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.715839147567749, + "num_tokens": 625747179.0, + "step": 24189 + }, + { + "epoch": 2.6564902262244674, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5062777996063232, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.7063781023025513, + "num_tokens": 625772030.0, + "step": 24190 + }, + { + "epoch": 2.656600043927081, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3821465969085693, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.706717848777771, + "num_tokens": 625800036.0, + "step": 24191 + }, + { + "epoch": 2.656709861629695, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.497467041015625, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7072838544845581, + "num_tokens": 625828947.0, + "step": 24192 + }, + { + "epoch": 2.6568196793323082, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4548580646514893, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7042949795722961, + "num_tokens": 625855482.0, + "step": 24193 + }, + { + "epoch": 2.656929497034922, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4773452281951904, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7300435304641724, + "num_tokens": 625881288.0, + "step": 24194 + }, + { + "epoch": 2.6570393147375357, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.546016216278076, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6996613144874573, + "num_tokens": 625905574.0, + "step": 24195 + }, + { + "epoch": 2.6571491324401495, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.233062982559204, + "learning_rate": 1e-06, + "loss": 0.9646, + 
"mean_token_accuracy": 0.7143415212631226, + "num_tokens": 625934805.0, + "step": 24196 + }, + { + "epoch": 2.6572589501427633, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.624567985534668, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7250257730484009, + "num_tokens": 625957195.0, + "step": 24197 + }, + { + "epoch": 2.6573687678453766, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7194900512695312, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7572207450866699, + "num_tokens": 625976035.0, + "step": 24198 + }, + { + "epoch": 2.6574785855479903, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.559479236602783, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7063306570053101, + "num_tokens": 625999073.0, + "step": 24199 + }, + { + "epoch": 2.657588403250604, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.423623561859131, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7109503746032715, + "num_tokens": 626026380.0, + "step": 24200 + }, + { + "epoch": 2.6576982209532174, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4430360794067383, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.7043372988700867, + "num_tokens": 626052800.0, + "step": 24201 + }, + { + "epoch": 2.6578080386558316, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5825164318084717, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7095821499824524, + "num_tokens": 626076458.0, + "step": 24202 + }, + { + "epoch": 2.657917856358445, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.701213836669922, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7317590117454529, + "num_tokens": 626097548.0, + "step": 24203 + }, + { + "epoch": 2.6580276740610587, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5113751888275146, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 
0.7255712151527405, + "num_tokens": 626121697.0, + "step": 24204 + }, + { + "epoch": 2.6581374917636724, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.653766393661499, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.718834400177002, + "num_tokens": 626143533.0, + "step": 24205 + }, + { + "epoch": 2.6582473094662857, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.332301378250122, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7018246650695801, + "num_tokens": 626169981.0, + "step": 24206 + }, + { + "epoch": 2.6583571271688995, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3987839221954346, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7064939737319946, + "num_tokens": 626197217.0, + "step": 24207 + }, + { + "epoch": 2.6584669448715132, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2135324478149414, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7170133590698242, + "num_tokens": 626229495.0, + "step": 24208 + }, + { + "epoch": 2.658576762574127, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2984776496887207, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7067181468009949, + "num_tokens": 626257022.0, + "step": 24209 + }, + { + "epoch": 2.6586865802767408, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.548053741455078, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7071632742881775, + "num_tokens": 626280771.0, + "step": 24210 + }, + { + "epoch": 2.658796397979354, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.613473653793335, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7059351205825806, + "num_tokens": 626305088.0, + "step": 24211 + }, + { + "epoch": 2.658906215681968, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6920526027679443, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.700846791267395, + 
"num_tokens": 626327391.0, + "step": 24212 + }, + { + "epoch": 2.6590160333845816, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5060036182403564, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7244930267333984, + "num_tokens": 626354456.0, + "step": 24213 + }, + { + "epoch": 2.6591258510871953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3305160999298096, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7253806591033936, + "num_tokens": 626380975.0, + "step": 24214 + }, + { + "epoch": 2.659235668789809, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2538959980010986, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7304677367210388, + "num_tokens": 626408294.0, + "step": 24215 + }, + { + "epoch": 2.6593454864924224, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3633718490600586, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7180231213569641, + "num_tokens": 626436301.0, + "step": 24216 + }, + { + "epoch": 2.659455304195036, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3832755088806152, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7046844363212585, + "num_tokens": 626461838.0, + "step": 24217 + }, + { + "epoch": 2.65956512189765, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2085514068603516, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7010873556137085, + "num_tokens": 626491759.0, + "step": 24218 + }, + { + "epoch": 2.6596749396002637, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4763941764831543, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7174709439277649, + "num_tokens": 626516598.0, + "step": 24219 + }, + { + "epoch": 2.6597847573028774, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4928860664367676, + "learning_rate": 1e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6953158378601074, + "num_tokens": 626542760.0, + 
"step": 24220 + }, + { + "epoch": 2.6598945750054908, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.420799493789673, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7218884229660034, + "num_tokens": 626568063.0, + "step": 24221 + }, + { + "epoch": 2.6600043927081045, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.09012770652771, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7190228700637817, + "num_tokens": 626602420.0, + "step": 24222 + }, + { + "epoch": 2.6601142104107183, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.669144868850708, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7397823929786682, + "num_tokens": 626622888.0, + "step": 24223 + }, + { + "epoch": 2.660224028113332, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0970072746276855, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7275921106338501, + "num_tokens": 626655392.0, + "step": 24224 + }, + { + "epoch": 2.660333845815946, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.162487745285034, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7012705206871033, + "num_tokens": 626687978.0, + "step": 24225 + }, + { + "epoch": 2.660443663518559, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4576447010040283, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7156140208244324, + "num_tokens": 626714453.0, + "step": 24226 + }, + { + "epoch": 2.660553481221173, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.582120895385742, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7182294130325317, + "num_tokens": 626738129.0, + "step": 24227 + }, + { + "epoch": 2.6606632989237866, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.606731414794922, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7162710428237915, + "num_tokens": 626762084.0, + "step": 24228 + }, + { + "epoch": 
2.6607731166264, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5527377128601074, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7132446765899658, + "num_tokens": 626786533.0, + "step": 24229 + }, + { + "epoch": 2.6608829343290137, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5852115154266357, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.719354510307312, + "num_tokens": 626811012.0, + "step": 24230 + }, + { + "epoch": 2.6609927520316274, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4770095348358154, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7376279234886169, + "num_tokens": 626835039.0, + "step": 24231 + }, + { + "epoch": 2.661102569734241, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3847482204437256, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7133532762527466, + "num_tokens": 626859162.0, + "step": 24232 + }, + { + "epoch": 2.661212387436855, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.278646945953369, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6912301778793335, + "num_tokens": 626890271.0, + "step": 24233 + }, + { + "epoch": 2.6613222051394683, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.666510820388794, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7082210779190063, + "num_tokens": 626916816.0, + "step": 24234 + }, + { + "epoch": 2.661432022842082, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1991188526153564, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6966891288757324, + "num_tokens": 626948544.0, + "step": 24235 + }, + { + "epoch": 2.661541840544696, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7038090229034424, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7020347118377686, + "num_tokens": 626969038.0, + "step": 24236 + }, + { + "epoch": 2.6616516582473095, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.123361825942993, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.7075721025466919, + "num_tokens": 627001751.0, + "step": 24237 + }, + { + "epoch": 2.6617614759499233, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5484328269958496, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7094501852989197, + "num_tokens": 627025947.0, + "step": 24238 + }, + { + "epoch": 2.6618712936525366, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7706820964813232, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.717470645904541, + "num_tokens": 627045642.0, + "step": 24239 + }, + { + "epoch": 2.6619811113551504, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3650522232055664, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7094961404800415, + "num_tokens": 627072345.0, + "step": 24240 + }, + { + "epoch": 2.662090929057764, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.297060251235962, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7203322649002075, + "num_tokens": 627100585.0, + "step": 24241 + }, + { + "epoch": 2.662200746760378, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.451455593109131, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.7009046077728271, + "num_tokens": 627126318.0, + "step": 24242 + }, + { + "epoch": 2.6623105644629916, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3390254974365234, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7010961174964905, + "num_tokens": 627153684.0, + "step": 24243 + }, + { + "epoch": 2.662420382165605, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.279245615005493, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7109267711639404, + "num_tokens": 627182507.0, + "step": 24244 + }, + { + "epoch": 2.6625301998682187, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.323268175125122, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7534592747688293, + "num_tokens": 627210042.0, + "step": 24245 + }, + { + "epoch": 2.6626400175708325, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.247988224029541, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7456774711608887, + "num_tokens": 627235725.0, + "step": 24246 + }, + { + "epoch": 2.662749835273446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2331972122192383, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7025429010391235, + "num_tokens": 627264374.0, + "step": 24247 + }, + { + "epoch": 2.66285965297606, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2915945053100586, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6983663439750671, + "num_tokens": 627292718.0, + "step": 24248 + }, + { + "epoch": 2.6629694706786733, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.227900266647339, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.689741849899292, + "num_tokens": 627323275.0, + "step": 24249 + }, + { + "epoch": 2.663079288381287, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5256588459014893, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7308284044265747, + "num_tokens": 627347055.0, + "step": 24250 + }, + { + "epoch": 2.663189106083901, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.330834150314331, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7015130519866943, + "num_tokens": 627375292.0, + "step": 24251 + }, + { + "epoch": 2.663298923786514, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3859124183654785, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6952804923057556, + "num_tokens": 627404543.0, + "step": 24252 + }, + { + "epoch": 2.6634087414891283, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.228971004486084, + 
"learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7071889042854309, + "num_tokens": 627435946.0, + "step": 24253 + }, + { + "epoch": 2.6635185591917416, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3862500190734863, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7029780149459839, + "num_tokens": 627463374.0, + "step": 24254 + }, + { + "epoch": 2.6636283768943554, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.652567148208618, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7196023464202881, + "num_tokens": 627486138.0, + "step": 24255 + }, + { + "epoch": 2.663738194596969, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.349222183227539, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7136783003807068, + "num_tokens": 627512833.0, + "step": 24256 + }, + { + "epoch": 2.6638480122995825, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2153379917144775, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.7033675909042358, + "num_tokens": 627545399.0, + "step": 24257 + }, + { + "epoch": 2.663957830002196, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3407270908355713, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7041885852813721, + "num_tokens": 627571910.0, + "step": 24258 + }, + { + "epoch": 2.66406764770481, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.551743745803833, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7081724405288696, + "num_tokens": 627600017.0, + "step": 24259 + }, + { + "epoch": 2.6641774654074237, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.272834539413452, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7099626064300537, + "num_tokens": 627630488.0, + "step": 24260 + }, + { + "epoch": 2.6642872831100375, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5402846336364746, + "learning_rate": 1e-06, + 
"loss": 1.0236, + "mean_token_accuracy": 0.7214195132255554, + "num_tokens": 627654978.0, + "step": 24261 + }, + { + "epoch": 2.664397100812651, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.683089017868042, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7023460865020752, + "num_tokens": 627676512.0, + "step": 24262 + }, + { + "epoch": 2.6645069185152646, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4344067573547363, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6913883686065674, + "num_tokens": 627702865.0, + "step": 24263 + }, + { + "epoch": 2.6646167362178783, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5380334854125977, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7192615866661072, + "num_tokens": 627729759.0, + "step": 24264 + }, + { + "epoch": 2.664726553920492, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.258338212966919, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.699476957321167, + "num_tokens": 627761297.0, + "step": 24265 + }, + { + "epoch": 2.664836371623106, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6502957344055176, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7186983823776245, + "num_tokens": 627783809.0, + "step": 24266 + }, + { + "epoch": 2.664946189325719, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.560004234313965, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7243339419364929, + "num_tokens": 627808469.0, + "step": 24267 + }, + { + "epoch": 2.665056007028333, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4434053897857666, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7307027578353882, + "num_tokens": 627832696.0, + "step": 24268 + }, + { + "epoch": 2.6651658247309467, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.497346878051758, + "learning_rate": 1e-06, + "loss": 0.8917, + 
"mean_token_accuracy": 0.728270411491394, + "num_tokens": 627857509.0, + "step": 24269 + }, + { + "epoch": 2.6652756424335604, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3853797912597656, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7355504035949707, + "num_tokens": 627883020.0, + "step": 24270 + }, + { + "epoch": 2.665385460136174, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4704060554504395, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7117213010787964, + "num_tokens": 627907355.0, + "step": 24271 + }, + { + "epoch": 2.6654952778387875, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4623701572418213, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7153431177139282, + "num_tokens": 627934318.0, + "step": 24272 + }, + { + "epoch": 2.6656050955414012, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4191370010375977, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7215089201927185, + "num_tokens": 627962342.0, + "step": 24273 + }, + { + "epoch": 2.665714913244015, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4607901573181152, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6920157670974731, + "num_tokens": 627989770.0, + "step": 24274 + }, + { + "epoch": 2.6658247309466288, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.846355438232422, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7479447722434998, + "num_tokens": 628009019.0, + "step": 24275 + }, + { + "epoch": 2.6659345486492425, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.148618459701538, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6866059303283691, + "num_tokens": 628044962.0, + "step": 24276 + }, + { + "epoch": 2.666044366351856, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4972996711730957, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 
0.7255578637123108, + "num_tokens": 628069155.0, + "step": 24277 + }, + { + "epoch": 2.6661541840544696, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.49774432182312, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7087147235870361, + "num_tokens": 628094640.0, + "step": 24278 + }, + { + "epoch": 2.6662640017570833, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4708497524261475, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.705409049987793, + "num_tokens": 628118746.0, + "step": 24279 + }, + { + "epoch": 2.6663738194596966, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.328115463256836, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7447535991668701, + "num_tokens": 628143374.0, + "step": 24280 + }, + { + "epoch": 2.6664836371623104, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.218282461166382, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7290021181106567, + "num_tokens": 628175194.0, + "step": 24281 + }, + { + "epoch": 2.666593454864924, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.472200632095337, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7261863946914673, + "num_tokens": 628199375.0, + "step": 24282 + }, + { + "epoch": 2.666703272567538, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.437187671661377, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7237511873245239, + "num_tokens": 628224584.0, + "step": 24283 + }, + { + "epoch": 2.6668130902701517, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5807242393493652, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7045085430145264, + "num_tokens": 628246848.0, + "step": 24284 + }, + { + "epoch": 2.666922907972765, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3625974655151367, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6861123442649841, + 
"num_tokens": 628274470.0, + "step": 24285 + }, + { + "epoch": 2.6670327256753787, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4218151569366455, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7310141324996948, + "num_tokens": 628299899.0, + "step": 24286 + }, + { + "epoch": 2.6671425433779925, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5163369178771973, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6986595988273621, + "num_tokens": 628323819.0, + "step": 24287 + }, + { + "epoch": 2.6672523610806063, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5920450687408447, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7210720777511597, + "num_tokens": 628346219.0, + "step": 24288 + }, + { + "epoch": 2.66736217878322, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4444243907928467, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7150629758834839, + "num_tokens": 628370534.0, + "step": 24289 + }, + { + "epoch": 2.6674719964858333, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3802483081817627, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7041711807250977, + "num_tokens": 628397247.0, + "step": 24290 + }, + { + "epoch": 2.667581814188447, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3842756748199463, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7173125743865967, + "num_tokens": 628424134.0, + "step": 24291 + }, + { + "epoch": 2.667691631891061, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2794761657714844, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7357474565505981, + "num_tokens": 628451886.0, + "step": 24292 + }, + { + "epoch": 2.6678014495936746, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4073879718780518, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7160319089889526, + "num_tokens": 628475401.0, + 
"step": 24293 + }, + { + "epoch": 2.6679112672962884, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6285626888275146, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7127156257629395, + "num_tokens": 628496463.0, + "step": 24294 + }, + { + "epoch": 2.6680210849989017, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7197256088256836, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7266390919685364, + "num_tokens": 628518543.0, + "step": 24295 + }, + { + "epoch": 2.6681309027015154, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3538661003112793, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7056261301040649, + "num_tokens": 628546183.0, + "step": 24296 + }, + { + "epoch": 2.668240720404129, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5255630016326904, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.715528666973114, + "num_tokens": 628569383.0, + "step": 24297 + }, + { + "epoch": 2.668350538106743, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.504243850708008, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7266008853912354, + "num_tokens": 628592940.0, + "step": 24298 + }, + { + "epoch": 2.6684603558093567, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.363770008087158, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7090656757354736, + "num_tokens": 628620055.0, + "step": 24299 + }, + { + "epoch": 2.66857017351197, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3455512523651123, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7184680700302124, + "num_tokens": 628645968.0, + "step": 24300 + }, + { + "epoch": 2.6686799912145838, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5328643321990967, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7124614715576172, + "num_tokens": 628667921.0, + "step": 24301 + }, + { + 
"epoch": 2.6687898089171975, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4563050270080566, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7241044640541077, + "num_tokens": 628693110.0, + "step": 24302 + }, + { + "epoch": 2.668899626619811, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.623481512069702, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7027817964553833, + "num_tokens": 628716271.0, + "step": 24303 + }, + { + "epoch": 2.669009444322425, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.399779796600342, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.707216203212738, + "num_tokens": 628741671.0, + "step": 24304 + }, + { + "epoch": 2.6691192620250384, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3937036991119385, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6979113817214966, + "num_tokens": 628768746.0, + "step": 24305 + }, + { + "epoch": 2.669229079727652, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5197412967681885, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7327380180358887, + "num_tokens": 628792673.0, + "step": 24306 + }, + { + "epoch": 2.669338897430266, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3550467491149902, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7007462978363037, + "num_tokens": 628817965.0, + "step": 24307 + }, + { + "epoch": 2.669448715132879, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1167478561401367, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6878114938735962, + "num_tokens": 628852368.0, + "step": 24308 + }, + { + "epoch": 2.669558532835493, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5979602336883545, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7165802717208862, + "num_tokens": 628876487.0, + "step": 24309 + }, + { + "epoch": 2.6696683505381067, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2705130577087402, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7114335298538208, + "num_tokens": 628904968.0, + "step": 24310 + }, + { + "epoch": 2.6697781682407205, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.188378095626831, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6895304918289185, + "num_tokens": 628936609.0, + "step": 24311 + }, + { + "epoch": 2.669887985943334, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6673736572265625, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7442911267280579, + "num_tokens": 628956272.0, + "step": 24312 + }, + { + "epoch": 2.6699978036459475, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.469012498855591, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7154205441474915, + "num_tokens": 628978794.0, + "step": 24313 + }, + { + "epoch": 2.6701076213485613, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5350749492645264, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7075566649436951, + "num_tokens": 629004138.0, + "step": 24314 + }, + { + "epoch": 2.670217439051175, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.582908868789673, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7091566324234009, + "num_tokens": 629030168.0, + "step": 24315 + }, + { + "epoch": 2.670327256753789, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2829394340515137, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7120041251182556, + "num_tokens": 629057981.0, + "step": 24316 + }, + { + "epoch": 2.6704370744564025, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2885446548461914, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7181640863418579, + "num_tokens": 629086282.0, + "step": 24317 + }, + { + "epoch": 2.670546892159016, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.5129506587982178, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7226182222366333, + "num_tokens": 629111573.0, + "step": 24318 + }, + { + "epoch": 2.6706567098616296, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.612168312072754, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7149507403373718, + "num_tokens": 629134667.0, + "step": 24319 + }, + { + "epoch": 2.6707665275642434, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1040260791778564, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.7027199268341064, + "num_tokens": 629169421.0, + "step": 24320 + }, + { + "epoch": 2.670876345266857, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6515276432037354, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.727125346660614, + "num_tokens": 629190802.0, + "step": 24321 + }, + { + "epoch": 2.670986162969471, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.562525749206543, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7080835700035095, + "num_tokens": 629214455.0, + "step": 24322 + }, + { + "epoch": 2.671095980672084, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.396728277206421, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7248460054397583, + "num_tokens": 629239574.0, + "step": 24323 + }, + { + "epoch": 2.671205798374698, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.398533344268799, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6975332498550415, + "num_tokens": 629266362.0, + "step": 24324 + }, + { + "epoch": 2.6713156160773117, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4087793827056885, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7191460728645325, + "num_tokens": 629290907.0, + "step": 24325 + }, + { + "epoch": 2.6714254337799255, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.4702348709106445, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7155793905258179, + "num_tokens": 629317945.0, + "step": 24326 + }, + { + "epoch": 2.6715352514825392, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.604067325592041, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7185139060020447, + "num_tokens": 629340176.0, + "step": 24327 + }, + { + "epoch": 2.6716450691851525, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.439023017883301, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6838269233703613, + "num_tokens": 629367833.0, + "step": 24328 + }, + { + "epoch": 2.6717548868877663, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4454355239868164, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7033944725990295, + "num_tokens": 629395065.0, + "step": 24329 + }, + { + "epoch": 2.67186470459038, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.495871067047119, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7170020937919617, + "num_tokens": 629420042.0, + "step": 24330 + }, + { + "epoch": 2.6719745222929934, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 32.38520812988281, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7403366565704346, + "num_tokens": 629438858.0, + "step": 24331 + }, + { + "epoch": 2.672084339995607, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5345494747161865, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7192668914794922, + "num_tokens": 629463482.0, + "step": 24332 + }, + { + "epoch": 2.672194157698221, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4986915588378906, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7039502263069153, + "num_tokens": 629489477.0, + "step": 24333 + }, + { + "epoch": 2.6723039754008346, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.2432188987731934, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7325566411018372, + "num_tokens": 629516528.0, + "step": 24334 + }, + { + "epoch": 2.6724137931034484, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6494548320770264, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7319604158401489, + "num_tokens": 629538327.0, + "step": 24335 + }, + { + "epoch": 2.6725236108060617, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.826956033706665, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7224462032318115, + "num_tokens": 629558109.0, + "step": 24336 + }, + { + "epoch": 2.6726334285086755, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.43999981880188, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7338619828224182, + "num_tokens": 629581221.0, + "step": 24337 + }, + { + "epoch": 2.6727432462112892, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3697917461395264, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.7028000354766846, + "num_tokens": 629610989.0, + "step": 24338 + }, + { + "epoch": 2.672853063913903, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.46134352684021, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6811877489089966, + "num_tokens": 629639140.0, + "step": 24339 + }, + { + "epoch": 2.6729628816165167, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.769850015640259, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7309558391571045, + "num_tokens": 629658917.0, + "step": 24340 + }, + { + "epoch": 2.67307269931913, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6297404766082764, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7240757942199707, + "num_tokens": 629683062.0, + "step": 24341 + }, + { + "epoch": 2.673182517021744, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1478147506713867, + 
"learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6895212531089783, + "num_tokens": 629717786.0, + "step": 24342 + }, + { + "epoch": 2.6732923347243576, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.637415647506714, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7247824668884277, + "num_tokens": 629738750.0, + "step": 24343 + }, + { + "epoch": 2.6734021524269713, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7129805088043213, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.700425922870636, + "num_tokens": 629759552.0, + "step": 24344 + }, + { + "epoch": 2.673511970129585, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3721017837524414, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7249493598937988, + "num_tokens": 629787628.0, + "step": 24345 + }, + { + "epoch": 2.6736217878321984, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3981831073760986, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7209289073944092, + "num_tokens": 629812867.0, + "step": 24346 + }, + { + "epoch": 2.673731605534812, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3966946601867676, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7089970111846924, + "num_tokens": 629839150.0, + "step": 24347 + }, + { + "epoch": 2.673841423237426, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5211496353149414, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7109602689743042, + "num_tokens": 629863973.0, + "step": 24348 + }, + { + "epoch": 2.6739512409400397, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.469785451889038, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7142554521560669, + "num_tokens": 629888021.0, + "step": 24349 + }, + { + "epoch": 2.6740610586426534, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1790404319763184, + "learning_rate": 1e-06, + 
"loss": 1.036, + "mean_token_accuracy": 0.6977182626724243, + "num_tokens": 629920231.0, + "step": 24350 + }, + { + "epoch": 2.6741708763452667, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4110753536224365, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7092710137367249, + "num_tokens": 629947745.0, + "step": 24351 + }, + { + "epoch": 2.6742806940478805, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6866917610168457, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7139587998390198, + "num_tokens": 629969679.0, + "step": 24352 + }, + { + "epoch": 2.6743905117504942, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.263380527496338, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6940393447875977, + "num_tokens": 630001784.0, + "step": 24353 + }, + { + "epoch": 2.674500329453108, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2833499908447266, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6992559432983398, + "num_tokens": 630033023.0, + "step": 24354 + }, + { + "epoch": 2.6746101471557218, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7239904403686523, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7071020603179932, + "num_tokens": 630059274.0, + "step": 24355 + }, + { + "epoch": 2.674719964858335, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.416997194290161, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7067745923995972, + "num_tokens": 630085439.0, + "step": 24356 + }, + { + "epoch": 2.674829782560949, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.416353702545166, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6977791786193848, + "num_tokens": 630111706.0, + "step": 24357 + }, + { + "epoch": 2.6749396002635626, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3279781341552734, + "learning_rate": 1e-06, + "loss": 0.9758, + 
"mean_token_accuracy": 0.7100024223327637, + "num_tokens": 630140800.0, + "step": 24358 + }, + { + "epoch": 2.675049417966176, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2655935287475586, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7057472467422485, + "num_tokens": 630170430.0, + "step": 24359 + }, + { + "epoch": 2.6751592356687897, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.618468999862671, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.724090039730072, + "num_tokens": 630192434.0, + "step": 24360 + }, + { + "epoch": 2.6752690533714034, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.459810495376587, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7194311618804932, + "num_tokens": 630216439.0, + "step": 24361 + }, + { + "epoch": 2.675378871074017, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3496651649475098, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.7004127502441406, + "num_tokens": 630244907.0, + "step": 24362 + }, + { + "epoch": 2.675488688776631, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3117949962615967, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.714763879776001, + "num_tokens": 630273273.0, + "step": 24363 + }, + { + "epoch": 2.6755985064792442, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.372910737991333, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7106627821922302, + "num_tokens": 630299721.0, + "step": 24364 + }, + { + "epoch": 2.675708324181858, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5020411014556885, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7070949673652649, + "num_tokens": 630325033.0, + "step": 24365 + }, + { + "epoch": 2.6758181418844718, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.441976547241211, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 
0.7043054103851318, + "num_tokens": 630351318.0, + "step": 24366 + }, + { + "epoch": 2.6759279595870855, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.301947832107544, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7085551023483276, + "num_tokens": 630378922.0, + "step": 24367 + }, + { + "epoch": 2.6760377772896993, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2699646949768066, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6994795203208923, + "num_tokens": 630408757.0, + "step": 24368 + }, + { + "epoch": 2.6761475949923126, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4613735675811768, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7278292775154114, + "num_tokens": 630435441.0, + "step": 24369 + }, + { + "epoch": 2.6762574126949263, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.374464988708496, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6979780197143555, + "num_tokens": 630462981.0, + "step": 24370 + }, + { + "epoch": 2.67636723039754, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8902745246887207, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7331218123435974, + "num_tokens": 630484003.0, + "step": 24371 + }, + { + "epoch": 2.676477048100154, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5285067558288574, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7113231420516968, + "num_tokens": 630510191.0, + "step": 24372 + }, + { + "epoch": 2.6765868658027676, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.659576654434204, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7205213308334351, + "num_tokens": 630532773.0, + "step": 24373 + }, + { + "epoch": 2.676696683505381, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4519639015197754, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7253110408782959, + 
"num_tokens": 630557174.0, + "step": 24374 + }, + { + "epoch": 2.6768065012079947, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5647683143615723, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7244054675102234, + "num_tokens": 630580489.0, + "step": 24375 + }, + { + "epoch": 2.6769163189106084, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2837300300598145, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7107114791870117, + "num_tokens": 630609870.0, + "step": 24376 + }, + { + "epoch": 2.677026136613222, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7238454818725586, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.713225245475769, + "num_tokens": 630630976.0, + "step": 24377 + }, + { + "epoch": 2.677135954315836, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.806501626968384, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7294814586639404, + "num_tokens": 630651152.0, + "step": 24378 + }, + { + "epoch": 2.6772457720184493, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3828210830688477, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.694845974445343, + "num_tokens": 630679285.0, + "step": 24379 + }, + { + "epoch": 2.677355589721063, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3923161029815674, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.7011800408363342, + "num_tokens": 630706399.0, + "step": 24380 + }, + { + "epoch": 2.677465407423677, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.335944890975952, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7505922317504883, + "num_tokens": 630732261.0, + "step": 24381 + }, + { + "epoch": 2.67757522512629, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.476606845855713, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7248480319976807, + "num_tokens": 630755814.0, + 
"step": 24382 + }, + { + "epoch": 2.6776850428289043, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3316407203674316, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.7067949175834656, + "num_tokens": 630782783.0, + "step": 24383 + }, + { + "epoch": 2.6777948605315176, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.173586845397949, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6918689608573914, + "num_tokens": 630815627.0, + "step": 24384 + }, + { + "epoch": 2.6779046782341314, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.1742115020751953, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7359554171562195, + "num_tokens": 630832948.0, + "step": 24385 + }, + { + "epoch": 2.678014495936745, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 4.104870319366455, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7200814485549927, + "num_tokens": 630860164.0, + "step": 24386 + }, + { + "epoch": 2.6781243136393584, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3908984661102295, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7102587223052979, + "num_tokens": 630889955.0, + "step": 24387 + }, + { + "epoch": 2.678234131341972, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.408855676651001, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.698929488658905, + "num_tokens": 630917747.0, + "step": 24388 + }, + { + "epoch": 2.678343949044586, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3614301681518555, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7024522423744202, + "num_tokens": 630945446.0, + "step": 24389 + }, + { + "epoch": 2.6784537667471997, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.422060012817383, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6902201771736145, + "num_tokens": 630973759.0, + "step": 24390 + }, + { + 
"epoch": 2.6785635844498135, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4186034202575684, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.7022337913513184, + "num_tokens": 631001578.0, + "step": 24391 + }, + { + "epoch": 2.6786734021524268, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.130779981613159, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7118558883666992, + "num_tokens": 631032517.0, + "step": 24392 + }, + { + "epoch": 2.6787832198550405, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7082200050354004, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7114397287368774, + "num_tokens": 631054209.0, + "step": 24393 + }, + { + "epoch": 2.6788930375576543, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.904123067855835, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7345633506774902, + "num_tokens": 631073210.0, + "step": 24394 + }, + { + "epoch": 2.679002855260268, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4498870372772217, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7199931144714355, + "num_tokens": 631099917.0, + "step": 24395 + }, + { + "epoch": 2.679112672962882, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4740240573883057, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6845923066139221, + "num_tokens": 631128902.0, + "step": 24396 + }, + { + "epoch": 2.679222490665495, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.33742356300354, + "learning_rate": 1e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6921087503433228, + "num_tokens": 631157803.0, + "step": 24397 + }, + { + "epoch": 2.679332308368109, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.7075064182281494, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7192121744155884, + "num_tokens": 631177743.0, + "step": 24398 + }, + { + "epoch": 2.6794421260707226, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.557605266571045, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.724985659122467, + "num_tokens": 631201585.0, + "step": 24399 + }, + { + "epoch": 2.6795519437733364, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1004905700683594, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7238637804985046, + "num_tokens": 631235262.0, + "step": 24400 + }, + { + "epoch": 2.67966176147595, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.664381742477417, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.705272912979126, + "num_tokens": 631257057.0, + "step": 24401 + }, + { + "epoch": 2.6797715791785635, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1235976219177246, + "learning_rate": 1e-06, + "loss": 1.1146, + "mean_token_accuracy": 0.6976786255836487, + "num_tokens": 631287863.0, + "step": 24402 + }, + { + "epoch": 2.679881396881177, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4689486026763916, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7076038122177124, + "num_tokens": 631312985.0, + "step": 24403 + }, + { + "epoch": 2.679991214583791, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6281163692474365, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7128475904464722, + "num_tokens": 631337562.0, + "step": 24404 + }, + { + "epoch": 2.6801010322864047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7520744800567627, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7272834777832031, + "num_tokens": 631357719.0, + "step": 24405 + }, + { + "epoch": 2.6802108499890185, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1762452125549316, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7145900726318359, + "num_tokens": 631389130.0, + "step": 24406 + }, + { + "epoch": 2.680320667691632, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.7036542892456055, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7228493094444275, + "num_tokens": 631410423.0, + "step": 24407 + }, + { + "epoch": 2.6804304853942456, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3544788360595703, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7327792048454285, + "num_tokens": 631437968.0, + "step": 24408 + }, + { + "epoch": 2.6805403030968593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.316528081893921, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7449715733528137, + "num_tokens": 631463678.0, + "step": 24409 + }, + { + "epoch": 2.6806501207994726, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.186619997024536, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7198125720024109, + "num_tokens": 631494062.0, + "step": 24410 + }, + { + "epoch": 2.6807599385020864, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4696245193481445, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7176704406738281, + "num_tokens": 631519307.0, + "step": 24411 + }, + { + "epoch": 2.6808697562047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5881431102752686, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7253603935241699, + "num_tokens": 631542890.0, + "step": 24412 + }, + { + "epoch": 2.680979573907314, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.149700164794922, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.685975968837738, + "num_tokens": 631577584.0, + "step": 24413 + }, + { + "epoch": 2.6810893916099277, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.450413703918457, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7050249576568604, + "num_tokens": 631602628.0, + "step": 24414 + }, + { + "epoch": 2.681199209312541, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.5612664222717285, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7246147394180298, + "num_tokens": 631625565.0, + "step": 24415 + }, + { + "epoch": 2.6813090270151547, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6269805431365967, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7199472784996033, + "num_tokens": 631648466.0, + "step": 24416 + }, + { + "epoch": 2.6814188447177685, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5309369564056396, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7059460282325745, + "num_tokens": 631672970.0, + "step": 24417 + }, + { + "epoch": 2.6815286624203822, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3836560249328613, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7009899020195007, + "num_tokens": 631700133.0, + "step": 24418 + }, + { + "epoch": 2.681638480122996, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417914867401123, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6883094310760498, + "num_tokens": 631726860.0, + "step": 24419 + }, + { + "epoch": 2.6817482978256093, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2613677978515625, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7074766755104065, + "num_tokens": 631754956.0, + "step": 24420 + }, + { + "epoch": 2.681858115528223, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.799865484237671, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7412226796150208, + "num_tokens": 631775773.0, + "step": 24421 + }, + { + "epoch": 2.681967933230837, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.594099998474121, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7226991057395935, + "num_tokens": 631798551.0, + "step": 24422 + }, + { + "epoch": 2.6820777509334506, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.35677433013916, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7125791907310486, + "num_tokens": 631826646.0, + "step": 24423 + }, + { + "epoch": 2.6821875686360643, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.143366813659668, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7425963878631592, + "num_tokens": 631858987.0, + "step": 24424 + }, + { + "epoch": 2.6822973863386776, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.526611328125, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7108246684074402, + "num_tokens": 631882069.0, + "step": 24425 + }, + { + "epoch": 2.6824072040412914, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2672324180603027, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7125715017318726, + "num_tokens": 631911029.0, + "step": 24426 + }, + { + "epoch": 2.682517021743905, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5198280811309814, + "learning_rate": 1e-06, + "loss": 1.0883, + "mean_token_accuracy": 0.6849980354309082, + "num_tokens": 631937513.0, + "step": 24427 + }, + { + "epoch": 2.682626839446519, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.268693447113037, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7313013076782227, + "num_tokens": 631964628.0, + "step": 24428 + }, + { + "epoch": 2.6827366571491327, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.620575189590454, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7102378606796265, + "num_tokens": 631987694.0, + "step": 24429 + }, + { + "epoch": 2.682846474851746, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4466280937194824, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6962078809738159, + "num_tokens": 632018211.0, + "step": 24430 + }, + { + "epoch": 2.6829562925543597, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.389439582824707, + 
"learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7043273448944092, + "num_tokens": 632047901.0, + "step": 24431 + }, + { + "epoch": 2.6830661102569735, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.613032341003418, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7403552532196045, + "num_tokens": 632068329.0, + "step": 24432 + }, + { + "epoch": 2.683175927959587, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4886510372161865, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7071458697319031, + "num_tokens": 632094069.0, + "step": 24433 + }, + { + "epoch": 2.683285745662201, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2480642795562744, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7114671468734741, + "num_tokens": 632123887.0, + "step": 24434 + }, + { + "epoch": 2.6833955633648143, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.431483745574951, + "learning_rate": 1e-06, + "loss": 1.1187, + "mean_token_accuracy": 0.6790294051170349, + "num_tokens": 632151927.0, + "step": 24435 + }, + { + "epoch": 2.683505381067428, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.8896846771240234, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7255474328994751, + "num_tokens": 632172273.0, + "step": 24436 + }, + { + "epoch": 2.683615198770042, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.474071979522705, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7062593698501587, + "num_tokens": 632200268.0, + "step": 24437 + }, + { + "epoch": 2.683725016472655, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2502810955047607, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6974623799324036, + "num_tokens": 632232815.0, + "step": 24438 + }, + { + "epoch": 2.683834834175269, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.635296583175659, + "learning_rate": 1e-06, + "loss": 
0.9792, + "mean_token_accuracy": 0.7139506936073303, + "num_tokens": 632256901.0, + "step": 24439 + }, + { + "epoch": 2.6839446518778827, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4174368381500244, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7136189937591553, + "num_tokens": 632282897.0, + "step": 24440 + }, + { + "epoch": 2.6840544695804964, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5312387943267822, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7159323692321777, + "num_tokens": 632307063.0, + "step": 24441 + }, + { + "epoch": 2.68416428728311, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.349639415740967, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7162479758262634, + "num_tokens": 632336231.0, + "step": 24442 + }, + { + "epoch": 2.6842741049857235, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2856993675231934, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.721565842628479, + "num_tokens": 632365393.0, + "step": 24443 + }, + { + "epoch": 2.6843839226883373, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.405174970626831, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6848108172416687, + "num_tokens": 632392173.0, + "step": 24444 + }, + { + "epoch": 2.684493740390951, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4593262672424316, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.698144257068634, + "num_tokens": 632419194.0, + "step": 24445 + }, + { + "epoch": 2.6846035580935648, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1763672828674316, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7123186588287354, + "num_tokens": 632452581.0, + "step": 24446 + }, + { + "epoch": 2.6847133757961785, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2232396602630615, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 
0.6903035640716553, + "num_tokens": 632485790.0, + "step": 24447 + }, + { + "epoch": 2.684823193498792, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.832404613494873, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7326457500457764, + "num_tokens": 632504942.0, + "step": 24448 + }, + { + "epoch": 2.6849330112014056, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.468830108642578, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7321937680244446, + "num_tokens": 632528163.0, + "step": 24449 + }, + { + "epoch": 2.6850428289040194, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.260105848312378, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7034399509429932, + "num_tokens": 632556386.0, + "step": 24450 + }, + { + "epoch": 2.685152646606633, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.815716028213501, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7195509672164917, + "num_tokens": 632578355.0, + "step": 24451 + }, + { + "epoch": 2.685262464309247, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.376220941543579, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7402032613754272, + "num_tokens": 632603086.0, + "step": 24452 + }, + { + "epoch": 2.68537228201186, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4180855751037598, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7172902822494507, + "num_tokens": 632631197.0, + "step": 24453 + }, + { + "epoch": 2.685482099714474, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3364193439483643, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7066631317138672, + "num_tokens": 632659157.0, + "step": 24454 + }, + { + "epoch": 2.6855919174170877, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4368793964385986, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6955125331878662, + "num_tokens": 
632684876.0, + "step": 24455 + }, + { + "epoch": 2.6857017351197014, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.8075432777404785, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.730849027633667, + "num_tokens": 632704324.0, + "step": 24456 + }, + { + "epoch": 2.685811552822315, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.298532009124756, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7188839912414551, + "num_tokens": 632732603.0, + "step": 24457 + }, + { + "epoch": 2.6859213705249285, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.168774366378784, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.708499550819397, + "num_tokens": 632766272.0, + "step": 24458 + }, + { + "epoch": 2.6860311882275423, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4117348194122314, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7143348455429077, + "num_tokens": 632793284.0, + "step": 24459 + }, + { + "epoch": 2.686141005930156, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2695043087005615, + "learning_rate": 1e-06, + "loss": 1.1054, + "mean_token_accuracy": 0.6824635863304138, + "num_tokens": 632822643.0, + "step": 24460 + }, + { + "epoch": 2.6862508236327693, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4129130840301514, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7036669254302979, + "num_tokens": 632848903.0, + "step": 24461 + }, + { + "epoch": 2.686360641335383, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5567846298217773, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7382751703262329, + "num_tokens": 632871012.0, + "step": 24462 + }, + { + "epoch": 2.686470459037997, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5781686305999756, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7267112731933594, + "num_tokens": 632893130.0, + "step": 24463 + 
}, + { + "epoch": 2.6865802767406106, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2998523712158203, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6977269053459167, + "num_tokens": 632921317.0, + "step": 24464 + }, + { + "epoch": 2.6866900944432244, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3492588996887207, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7188308835029602, + "num_tokens": 632949586.0, + "step": 24465 + }, + { + "epoch": 2.6867999121458377, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5493054389953613, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6922625303268433, + "num_tokens": 632975352.0, + "step": 24466 + }, + { + "epoch": 2.6869097298484514, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.201829671859741, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7298290729522705, + "num_tokens": 633004096.0, + "step": 24467 + }, + { + "epoch": 2.687019547551065, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.263612747192383, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6925147771835327, + "num_tokens": 633035934.0, + "step": 24468 + }, + { + "epoch": 2.687129365253679, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1483442783355713, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7230294942855835, + "num_tokens": 633066967.0, + "step": 24469 + }, + { + "epoch": 2.6872391829562927, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6017420291900635, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7089428901672363, + "num_tokens": 633091102.0, + "step": 24470 + }, + { + "epoch": 2.687349000658906, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.304809093475342, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7346498370170593, + "num_tokens": 633115293.0, + "step": 24471 + }, + { + "epoch": 
2.68745881836152, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.590656042098999, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7215917706489563, + "num_tokens": 633138373.0, + "step": 24472 + }, + { + "epoch": 2.6875686360641335, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4726245403289795, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.721228837966919, + "num_tokens": 633162004.0, + "step": 24473 + }, + { + "epoch": 2.6876784537667473, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4195191860198975, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7240848541259766, + "num_tokens": 633185919.0, + "step": 24474 + }, + { + "epoch": 2.687788271469361, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3770177364349365, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7109402418136597, + "num_tokens": 633214269.0, + "step": 24475 + }, + { + "epoch": 2.6878980891719744, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3166279792785645, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7076133489608765, + "num_tokens": 633242678.0, + "step": 24476 + }, + { + "epoch": 2.688007906874588, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.713946580886841, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6996930837631226, + "num_tokens": 633264820.0, + "step": 24477 + }, + { + "epoch": 2.688117724577202, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0804710388183594, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.699993371963501, + "num_tokens": 633298596.0, + "step": 24478 + }, + { + "epoch": 2.6882275422798156, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3194994926452637, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7246400117874146, + "num_tokens": 633325792.0, + "step": 24479 + }, + { + "epoch": 2.6883373599824294, + "ewc_loss": 
2.2292137145996094e-05, + "grad_norm": 2.6014351844787598, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7218726873397827, + "num_tokens": 633348353.0, + "step": 24480 + }, + { + "epoch": 2.6884471776850427, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5505199432373047, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7249754071235657, + "num_tokens": 633371318.0, + "step": 24481 + }, + { + "epoch": 2.6885569953876565, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.1906545162200928, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7137791514396667, + "num_tokens": 633400630.0, + "step": 24482 + }, + { + "epoch": 2.6886668130902702, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.363980531692505, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7420613169670105, + "num_tokens": 633425392.0, + "step": 24483 + }, + { + "epoch": 2.6887766307928835, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6237826347351074, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7133526802062988, + "num_tokens": 633450601.0, + "step": 24484 + }, + { + "epoch": 2.6888864484954977, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.464501142501831, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7187743186950684, + "num_tokens": 633475397.0, + "step": 24485 + }, + { + "epoch": 2.688996266198111, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.436859130859375, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7143405079841614, + "num_tokens": 633500874.0, + "step": 24486 + }, + { + "epoch": 2.689106083900725, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2966926097869873, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6956034898757935, + "num_tokens": 633530432.0, + "step": 24487 + }, + { + "epoch": 2.6892159016033386, + "ewc_loss": 2.2292137145996094e-05, + 
"grad_norm": 2.4204509258270264, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7197365164756775, + "num_tokens": 633557693.0, + "step": 24488 + }, + { + "epoch": 2.689325719305952, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.297353982925415, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7271391153335571, + "num_tokens": 633584872.0, + "step": 24489 + }, + { + "epoch": 2.6894355370085656, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.778488874435425, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7359274625778198, + "num_tokens": 633605096.0, + "step": 24490 + }, + { + "epoch": 2.6895453547111794, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5784032344818115, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.722943902015686, + "num_tokens": 633628223.0, + "step": 24491 + }, + { + "epoch": 2.689655172413793, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4024498462677, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7178411483764648, + "num_tokens": 633654837.0, + "step": 24492 + }, + { + "epoch": 2.689764990116407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5528550148010254, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7087490558624268, + "num_tokens": 633678396.0, + "step": 24493 + }, + { + "epoch": 2.68987480781902, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2483901977539062, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7049089670181274, + "num_tokens": 633709747.0, + "step": 24494 + }, + { + "epoch": 2.689984625521634, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3610126972198486, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6861013770103455, + "num_tokens": 633738224.0, + "step": 24495 + }, + { + "epoch": 2.6900944432242477, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2934699058532715, + 
"learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6873054504394531, + "num_tokens": 633765948.0, + "step": 24496 + }, + { + "epoch": 2.6902042609268615, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.518651008605957, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7228155732154846, + "num_tokens": 633789191.0, + "step": 24497 + }, + { + "epoch": 2.6903140786294752, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.958730936050415, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7441931962966919, + "num_tokens": 633806814.0, + "step": 24498 + }, + { + "epoch": 2.6904238963320886, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.280287981033325, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7187461256980896, + "num_tokens": 633836908.0, + "step": 24499 + }, + { + "epoch": 2.6905337140347023, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5926787853240967, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7398014068603516, + "num_tokens": 633859780.0, + "step": 24500 + }, + { + "epoch": 2.690643531737316, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.547477960586548, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6903964281082153, + "num_tokens": 633885859.0, + "step": 24501 + }, + { + "epoch": 2.69075334943993, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6446995735168457, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7143663167953491, + "num_tokens": 633909121.0, + "step": 24502 + }, + { + "epoch": 2.6908631671425436, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.324288845062256, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6833481788635254, + "num_tokens": 633938167.0, + "step": 24503 + }, + { + "epoch": 2.690972984845157, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3099584579467773, + "learning_rate": 1e-06, + 
"loss": 0.9043, + "mean_token_accuracy": 0.7296079397201538, + "num_tokens": 633965109.0, + "step": 24504 + }, + { + "epoch": 2.6910828025477707, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.468010425567627, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7135644555091858, + "num_tokens": 633988185.0, + "step": 24505 + }, + { + "epoch": 2.6911926202503844, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.642209053039551, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6982352137565613, + "num_tokens": 634012442.0, + "step": 24506 + }, + { + "epoch": 2.691302437952998, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5156195163726807, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7236037254333496, + "num_tokens": 634036849.0, + "step": 24507 + }, + { + "epoch": 2.691412255655612, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5550875663757324, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.7026078701019287, + "num_tokens": 634061913.0, + "step": 24508 + }, + { + "epoch": 2.6915220733582252, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4315524101257324, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6880737543106079, + "num_tokens": 634088505.0, + "step": 24509 + }, + { + "epoch": 2.691631891060839, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.2930331230163574, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7090091705322266, + "num_tokens": 634119812.0, + "step": 24510 + }, + { + "epoch": 2.6917417087634528, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.422635793685913, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.718981146812439, + "num_tokens": 634144470.0, + "step": 24511 + }, + { + "epoch": 2.691851526466066, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.092359781265259, + "learning_rate": 1e-06, + "loss": 1.0525, + 
"mean_token_accuracy": 0.7042280435562134, + "num_tokens": 634178419.0, + "step": 24512 + }, + { + "epoch": 2.69196134416868, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.610792398452759, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7041915059089661, + "num_tokens": 634204048.0, + "step": 24513 + }, + { + "epoch": 2.6920711618712936, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.6414120197296143, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7378081679344177, + "num_tokens": 634224837.0, + "step": 24514 + }, + { + "epoch": 2.6921809795739073, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3850631713867188, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7035923600196838, + "num_tokens": 634251640.0, + "step": 24515 + }, + { + "epoch": 2.692290797276521, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.26847767829895, + "learning_rate": 1e-06, + "loss": 1.1361, + "mean_token_accuracy": 0.6726293563842773, + "num_tokens": 634285192.0, + "step": 24516 + }, + { + "epoch": 2.6924006149791344, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.25051212310791, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7139874696731567, + "num_tokens": 634313160.0, + "step": 24517 + }, + { + "epoch": 2.692510432681748, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6454789638519287, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7413804531097412, + "num_tokens": 634334500.0, + "step": 24518 + }, + { + "epoch": 2.692620250384362, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.304259777069092, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7104405164718628, + "num_tokens": 634361753.0, + "step": 24519 + }, + { + "epoch": 2.6927300680869757, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4563636779785156, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 
0.7265181541442871, + "num_tokens": 634387044.0, + "step": 24520 + }, + { + "epoch": 2.6928398857895894, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2816929817199707, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6986583471298218, + "num_tokens": 634416653.0, + "step": 24521 + }, + { + "epoch": 2.6929497034922028, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.512383460998535, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7244006991386414, + "num_tokens": 634440434.0, + "step": 24522 + }, + { + "epoch": 2.6930595211948165, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.3758292198181152, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7092255353927612, + "num_tokens": 634466287.0, + "step": 24523 + }, + { + "epoch": 2.6931693388974303, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.427278757095337, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7259921431541443, + "num_tokens": 634491980.0, + "step": 24524 + }, + { + "epoch": 2.693279156600044, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.4847512245178223, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7242347598075867, + "num_tokens": 634515161.0, + "step": 24525 + }, + { + "epoch": 2.693388974302658, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.63948655128479, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7122984528541565, + "num_tokens": 634536852.0, + "step": 24526 + }, + { + "epoch": 2.693498792005271, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.50290846824646, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7231205105781555, + "num_tokens": 634561649.0, + "step": 24527 + }, + { + "epoch": 2.693608609707885, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3458166122436523, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7317910194396973, + 
"num_tokens": 634586384.0, + "step": 24528 + }, + { + "epoch": 2.6937184274104986, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2574033737182617, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7128229141235352, + "num_tokens": 634617927.0, + "step": 24529 + }, + { + "epoch": 2.6938282451131124, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.378359317779541, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7263081669807434, + "num_tokens": 634644921.0, + "step": 24530 + }, + { + "epoch": 2.693938062815726, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6809213161468506, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7382245063781738, + "num_tokens": 634665892.0, + "step": 24531 + }, + { + "epoch": 2.6940478805183394, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2272725105285645, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7214572429656982, + "num_tokens": 634696236.0, + "step": 24532 + }, + { + "epoch": 2.694157698220953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.25677490234375, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7229170799255371, + "num_tokens": 634725664.0, + "step": 24533 + }, + { + "epoch": 2.694267515923567, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.566422700881958, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7166144847869873, + "num_tokens": 634749180.0, + "step": 24534 + }, + { + "epoch": 2.6943773336261807, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.68669056892395, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7227776646614075, + "num_tokens": 634769158.0, + "step": 24535 + }, + { + "epoch": 2.6944871513287945, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.552095651626587, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7124090194702148, + "num_tokens": 634791663.0, + 
"step": 24536 + }, + { + "epoch": 2.6945969690314078, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4967758655548096, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7157361507415771, + "num_tokens": 634817045.0, + "step": 24537 + }, + { + "epoch": 2.6947067867340215, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.746208906173706, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7158008813858032, + "num_tokens": 634838364.0, + "step": 24538 + }, + { + "epoch": 2.6948166044366353, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.38218092918396, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6956223249435425, + "num_tokens": 634867696.0, + "step": 24539 + }, + { + "epoch": 2.6949264221392486, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.80808162689209, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.712285578250885, + "num_tokens": 634887761.0, + "step": 24540 + }, + { + "epoch": 2.6950362398418624, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5484673976898193, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7194005250930786, + "num_tokens": 634910623.0, + "step": 24541 + }, + { + "epoch": 2.695146057544476, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5085086822509766, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.713555634021759, + "num_tokens": 634933794.0, + "step": 24542 + }, + { + "epoch": 2.69525587524709, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3944344520568848, + "learning_rate": 1e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6855727434158325, + "num_tokens": 634963445.0, + "step": 24543 + }, + { + "epoch": 2.6953656929497036, + "ewc_loss": 2.2292137145996094e-05, + "grad_norm": 2.5559537410736084, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7274676561355591, + "num_tokens": 634984866.0, + "step": 24544 + }, + { + "epoch": 
2.695475510652317, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3649702072143555, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7220970392227173, + "num_tokens": 635012109.0, + "step": 24545 + }, + { + "epoch": 2.6955853283549307, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3352558612823486, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.7040690779685974, + "num_tokens": 635040466.0, + "step": 24546 + }, + { + "epoch": 2.6956951460575445, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5475165843963623, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7202222943305969, + "num_tokens": 635064335.0, + "step": 24547 + }, + { + "epoch": 2.695804963760158, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3812673091888428, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7059568166732788, + "num_tokens": 635091295.0, + "step": 24548 + }, + { + "epoch": 2.695914781462772, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2794225215911865, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6990303993225098, + "num_tokens": 635122405.0, + "step": 24549 + }, + { + "epoch": 2.6960245991653853, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7318623065948486, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7395654320716858, + "num_tokens": 635143309.0, + "step": 24550 + }, + { + "epoch": 2.696134416867999, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.304530382156372, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7209941148757935, + "num_tokens": 635172218.0, + "step": 24551 + }, + { + "epoch": 2.696244234570613, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.098975419998169, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6905049085617065, + "num_tokens": 635206805.0, + "step": 24552 + }, + { + "epoch": 2.6963540522732266, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3694941997528076, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7124974131584167, + "num_tokens": 635233987.0, + "step": 24553 + }, + { + "epoch": 2.6964638699758403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.706235885620117, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6945221424102783, + "num_tokens": 635257790.0, + "step": 24554 + }, + { + "epoch": 2.6965736876784536, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.698124647140503, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7189773917198181, + "num_tokens": 635280021.0, + "step": 24555 + }, + { + "epoch": 2.6966835053810674, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.328972578048706, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6979050040245056, + "num_tokens": 635309456.0, + "step": 24556 + }, + { + "epoch": 2.696793323083681, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5106773376464844, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7165444493293762, + "num_tokens": 635331597.0, + "step": 24557 + }, + { + "epoch": 2.696903140786295, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.814784526824951, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7191213369369507, + "num_tokens": 635352324.0, + "step": 24558 + }, + { + "epoch": 2.6970129584889087, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6876909732818604, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7257868051528931, + "num_tokens": 635373940.0, + "step": 24559 + }, + { + "epoch": 2.697122776191522, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.154364824295044, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7037152051925659, + "num_tokens": 635407931.0, + "step": 24560 + }, + { + "epoch": 2.6972325938941357, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.167963743209839, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.710128128528595, + "num_tokens": 635437344.0, + "step": 24561 + }, + { + "epoch": 2.6973424115967495, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.585479259490967, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.7015928030014038, + "num_tokens": 635462421.0, + "step": 24562 + }, + { + "epoch": 2.697452229299363, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8564343452453613, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7293030619621277, + "num_tokens": 635481273.0, + "step": 24563 + }, + { + "epoch": 2.697562047001977, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4749302864074707, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7093912363052368, + "num_tokens": 635505387.0, + "step": 24564 + }, + { + "epoch": 2.6976718647045903, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.448291301727295, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6954547166824341, + "num_tokens": 635533811.0, + "step": 24565 + }, + { + "epoch": 2.697781682407204, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3879127502441406, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7135426998138428, + "num_tokens": 635561271.0, + "step": 24566 + }, + { + "epoch": 2.697891500109818, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5191152095794678, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7045533061027527, + "num_tokens": 635584208.0, + "step": 24567 + }, + { + "epoch": 2.698001317812431, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.252721071243286, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7382660508155823, + "num_tokens": 635611111.0, + "step": 24568 + }, + { + "epoch": 2.698111135515045, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.3520121574401855, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7230427861213684, + "num_tokens": 635638897.0, + "step": 24569 + }, + { + "epoch": 2.6982209532176586, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2208471298217773, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7186199426651001, + "num_tokens": 635670072.0, + "step": 24570 + }, + { + "epoch": 2.6983307709202724, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7789323329925537, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7318453788757324, + "num_tokens": 635688613.0, + "step": 24571 + }, + { + "epoch": 2.698440588622886, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5752575397491455, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7212121486663818, + "num_tokens": 635711296.0, + "step": 24572 + }, + { + "epoch": 2.6985504063254995, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.331629514694214, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7162584066390991, + "num_tokens": 635741297.0, + "step": 24573 + }, + { + "epoch": 2.6986602240281132, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2054035663604736, + "learning_rate": 1e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.7026975154876709, + "num_tokens": 635776310.0, + "step": 24574 + }, + { + "epoch": 2.698770041730727, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.47719669342041, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.710885226726532, + "num_tokens": 635799526.0, + "step": 24575 + }, + { + "epoch": 2.6988798594333407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3560943603515625, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.700349748134613, + "num_tokens": 635827342.0, + "step": 24576 + }, + { + "epoch": 2.6989896771359545, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.4016494750976562, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7168751955032349, + "num_tokens": 635853904.0, + "step": 24577 + }, + { + "epoch": 2.699099494838568, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.557882308959961, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7131504416465759, + "num_tokens": 635882300.0, + "step": 24578 + }, + { + "epoch": 2.6992093125411816, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.431471109390259, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7268965840339661, + "num_tokens": 635911264.0, + "step": 24579 + }, + { + "epoch": 2.6993191302437953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.397871971130371, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7168854475021362, + "num_tokens": 635937409.0, + "step": 24580 + }, + { + "epoch": 2.699428947946409, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.419234275817871, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.7094162702560425, + "num_tokens": 635962176.0, + "step": 24581 + }, + { + "epoch": 2.699538765649023, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.52958083152771, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.724822461605072, + "num_tokens": 635985408.0, + "step": 24582 + }, + { + "epoch": 2.699648583351636, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.673161268234253, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7306256890296936, + "num_tokens": 636008460.0, + "step": 24583 + }, + { + "epoch": 2.69975840105425, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.47660493850708, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6997541785240173, + "num_tokens": 636033600.0, + "step": 24584 + }, + { + "epoch": 2.6998682187568637, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2336316108703613, + "learning_rate": 
1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6863250732421875, + "num_tokens": 636064531.0, + "step": 24585 + }, + { + "epoch": 2.6999780364594774, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4542863368988037, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7195526361465454, + "num_tokens": 636088199.0, + "step": 24586 + }, + { + "epoch": 2.700087854162091, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4709513187408447, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7208085060119629, + "num_tokens": 636114960.0, + "step": 24587 + }, + { + "epoch": 2.7001976718647045, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4062650203704834, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6934472918510437, + "num_tokens": 636142518.0, + "step": 24588 + }, + { + "epoch": 2.7003074895673183, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2783730030059814, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.715341329574585, + "num_tokens": 636170558.0, + "step": 24589 + }, + { + "epoch": 2.700417307269932, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6191179752349854, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7197448015213013, + "num_tokens": 636195178.0, + "step": 24590 + }, + { + "epoch": 2.7005271249725453, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.693295955657959, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6856344938278198, + "num_tokens": 636219507.0, + "step": 24591 + }, + { + "epoch": 2.700636942675159, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3050765991210938, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6993330121040344, + "num_tokens": 636249262.0, + "step": 24592 + }, + { + "epoch": 2.700746760377773, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2837560176849365, + "learning_rate": 1e-06, + "loss": 1.1293, + 
"mean_token_accuracy": 0.6796289682388306, + "num_tokens": 636282270.0, + "step": 24593 + }, + { + "epoch": 2.7008565780803866, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.399850368499756, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6914277076721191, + "num_tokens": 636309437.0, + "step": 24594 + }, + { + "epoch": 2.7009663957830004, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5414578914642334, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7297515869140625, + "num_tokens": 636334045.0, + "step": 24595 + }, + { + "epoch": 2.7010762134856137, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.662384033203125, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7107983827590942, + "num_tokens": 636356655.0, + "step": 24596 + }, + { + "epoch": 2.7011860311882274, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.454657793045044, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7374792098999023, + "num_tokens": 636380594.0, + "step": 24597 + }, + { + "epoch": 2.701295848890841, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.310209035873413, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7080181837081909, + "num_tokens": 636409274.0, + "step": 24598 + }, + { + "epoch": 2.701405666593455, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6156327724456787, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7265746593475342, + "num_tokens": 636431881.0, + "step": 24599 + }, + { + "epoch": 2.7015154842960687, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5328052043914795, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7038496732711792, + "num_tokens": 636456702.0, + "step": 24600 + }, + { + "epoch": 2.701625301998682, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.609241485595703, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 
0.7462465167045593, + "num_tokens": 636477902.0, + "step": 24601 + }, + { + "epoch": 2.7017351197012958, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3737728595733643, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7116749882698059, + "num_tokens": 636506103.0, + "step": 24602 + }, + { + "epoch": 2.7018449374039095, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3546805381774902, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7154840230941772, + "num_tokens": 636534483.0, + "step": 24603 + }, + { + "epoch": 2.7019547551065233, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4036991596221924, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7201275825500488, + "num_tokens": 636560499.0, + "step": 24604 + }, + { + "epoch": 2.702064572809137, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4439828395843506, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.710503101348877, + "num_tokens": 636587021.0, + "step": 24605 + }, + { + "epoch": 2.7021743905117503, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.395094871520996, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.718531608581543, + "num_tokens": 636611808.0, + "step": 24606 + }, + { + "epoch": 2.702284208214364, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.175797462463379, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7119952440261841, + "num_tokens": 636642235.0, + "step": 24607 + }, + { + "epoch": 2.702394025916978, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4946329593658447, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.7060924172401428, + "num_tokens": 636668339.0, + "step": 24608 + }, + { + "epoch": 2.7025038436195916, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4614267349243164, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7056453227996826, + 
"num_tokens": 636694031.0, + "step": 24609 + }, + { + "epoch": 2.7026136613222054, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.316734552383423, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7209808826446533, + "num_tokens": 636721196.0, + "step": 24610 + }, + { + "epoch": 2.7027234790248187, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.390596866607666, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7080872654914856, + "num_tokens": 636744077.0, + "step": 24611 + }, + { + "epoch": 2.7028332967274324, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.361318588256836, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.7076162099838257, + "num_tokens": 636771047.0, + "step": 24612 + }, + { + "epoch": 2.702943114430046, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 4.402862071990967, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7262120842933655, + "num_tokens": 636797984.0, + "step": 24613 + }, + { + "epoch": 2.7030529321326595, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.621220111846924, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7232049703598022, + "num_tokens": 636820040.0, + "step": 24614 + }, + { + "epoch": 2.7031627498352737, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3475253582000732, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.732452392578125, + "num_tokens": 636847213.0, + "step": 24615 + }, + { + "epoch": 2.703272567537887, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4527907371520996, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7166545391082764, + "num_tokens": 636873425.0, + "step": 24616 + }, + { + "epoch": 2.703382385240501, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5423126220703125, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7181953191757202, + "num_tokens": 636895224.0, + 
"step": 24617 + }, + { + "epoch": 2.7034922029431145, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3021392822265625, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6936522126197815, + "num_tokens": 636925505.0, + "step": 24618 + }, + { + "epoch": 2.703602020645728, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.344161033630371, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7135810256004333, + "num_tokens": 636950985.0, + "step": 24619 + }, + { + "epoch": 2.7037118383483416, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.706378221511841, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.719763457775116, + "num_tokens": 636972902.0, + "step": 24620 + }, + { + "epoch": 2.7038216560509554, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.602076768875122, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7111644744873047, + "num_tokens": 636995027.0, + "step": 24621 + }, + { + "epoch": 2.703931473753569, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.228327751159668, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6971046924591064, + "num_tokens": 637025443.0, + "step": 24622 + }, + { + "epoch": 2.704041291456183, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3627047538757324, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7173946499824524, + "num_tokens": 637053295.0, + "step": 24623 + }, + { + "epoch": 2.704151109158796, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3640809059143066, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7182058095932007, + "num_tokens": 637078599.0, + "step": 24624 + }, + { + "epoch": 2.70426092686141, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2842700481414795, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7061935663223267, + "num_tokens": 637109623.0, + "step": 24625 + }, + { + "epoch": 
2.7043707445640237, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4790139198303223, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7108213305473328, + "num_tokens": 637132960.0, + "step": 24626 + }, + { + "epoch": 2.7044805622666375, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417980670928955, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7264046669006348, + "num_tokens": 637159563.0, + "step": 24627 + }, + { + "epoch": 2.7045903799692512, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.12768292427063, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7272518277168274, + "num_tokens": 637189795.0, + "step": 24628 + }, + { + "epoch": 2.7047001976718645, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2305874824523926, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7014694809913635, + "num_tokens": 637217854.0, + "step": 24629 + }, + { + "epoch": 2.7048100153744783, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.199035167694092, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7451356053352356, + "num_tokens": 637246068.0, + "step": 24630 + }, + { + "epoch": 2.704919833077092, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3067712783813477, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6991484761238098, + "num_tokens": 637275907.0, + "step": 24631 + }, + { + "epoch": 2.705029650779706, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5956075191497803, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7106792330741882, + "num_tokens": 637298231.0, + "step": 24632 + }, + { + "epoch": 2.7051394684823196, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4030375480651855, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7261320352554321, + "num_tokens": 637324818.0, + "step": 24633 + }, + { + "epoch": 2.705249286184933, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.336754322052002, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7264937162399292, + "num_tokens": 637351693.0, + "step": 24634 + }, + { + "epoch": 2.7053591038875466, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.452105760574341, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7320965528488159, + "num_tokens": 637376108.0, + "step": 24635 + }, + { + "epoch": 2.7054689215901604, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.535996437072754, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.7108742594718933, + "num_tokens": 637402645.0, + "step": 24636 + }, + { + "epoch": 2.705578739292774, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5049479007720947, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7011263370513916, + "num_tokens": 637427056.0, + "step": 24637 + }, + { + "epoch": 2.705688556995388, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2568776607513428, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7056534886360168, + "num_tokens": 637457793.0, + "step": 24638 + }, + { + "epoch": 2.705798374698001, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.451570510864258, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7311527729034424, + "num_tokens": 637481962.0, + "step": 24639 + }, + { + "epoch": 2.705908192400615, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8415932655334473, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.729799211025238, + "num_tokens": 637502153.0, + "step": 24640 + }, + { + "epoch": 2.7060180101032287, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.359328508377075, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.69873046875, + "num_tokens": 637529072.0, + "step": 24641 + }, + { + "epoch": 2.706127827805842, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.7677910327911377, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7340559959411621, + "num_tokens": 637549332.0, + "step": 24642 + }, + { + "epoch": 2.706237645508456, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.683318853378296, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7421641945838928, + "num_tokens": 637571205.0, + "step": 24643 + }, + { + "epoch": 2.7063474632110696, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.379375457763672, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7038434743881226, + "num_tokens": 637599018.0, + "step": 24644 + }, + { + "epoch": 2.7064572809136833, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3777730464935303, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6991293430328369, + "num_tokens": 637626618.0, + "step": 24645 + }, + { + "epoch": 2.706567098616297, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2761518955230713, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7090463638305664, + "num_tokens": 637655494.0, + "step": 24646 + }, + { + "epoch": 2.7066769163189104, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.494521141052246, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7133100628852844, + "num_tokens": 637680457.0, + "step": 24647 + }, + { + "epoch": 2.706786734021524, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3369033336639404, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.71575927734375, + "num_tokens": 637708205.0, + "step": 24648 + }, + { + "epoch": 2.706896551724138, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.162968158721924, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7332476377487183, + "num_tokens": 637725835.0, + "step": 24649 + }, + { + "epoch": 2.7070063694267517, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.158421516418457, + 
"learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.712731122970581, + "num_tokens": 637757318.0, + "step": 24650 + }, + { + "epoch": 2.7071161871293654, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2547574043273926, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7089670896530151, + "num_tokens": 637786045.0, + "step": 24651 + }, + { + "epoch": 2.7072260048319787, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3570268154144287, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7174687385559082, + "num_tokens": 637814344.0, + "step": 24652 + }, + { + "epoch": 2.7073358225345925, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.166426658630371, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7022964954376221, + "num_tokens": 637851542.0, + "step": 24653 + }, + { + "epoch": 2.7074456402372062, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3841285705566406, + "learning_rate": 1e-06, + "loss": 1.0907, + "mean_token_accuracy": 0.682270884513855, + "num_tokens": 637880353.0, + "step": 24654 + }, + { + "epoch": 2.70755545793982, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4492716789245605, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7081563472747803, + "num_tokens": 637905596.0, + "step": 24655 + }, + { + "epoch": 2.7076652756424338, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5215342044830322, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7104244232177734, + "num_tokens": 637930958.0, + "step": 24656 + }, + { + "epoch": 2.707775093345047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.718087673187256, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7257200479507446, + "num_tokens": 637952486.0, + "step": 24657 + }, + { + "epoch": 2.707884911047661, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.421337127685547, + "learning_rate": 1e-06, + "loss": 
1.0877, + "mean_token_accuracy": 0.6899474859237671, + "num_tokens": 637980785.0, + "step": 24658 + }, + { + "epoch": 2.7079947287502746, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2947041988372803, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.7024073600769043, + "num_tokens": 638012640.0, + "step": 24659 + }, + { + "epoch": 2.7081045464528883, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.61401629447937, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7313321232795715, + "num_tokens": 638035306.0, + "step": 24660 + }, + { + "epoch": 2.708214364155502, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.298581123352051, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7120809555053711, + "num_tokens": 638064193.0, + "step": 24661 + }, + { + "epoch": 2.7083241818581154, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.529244899749756, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7307721376419067, + "num_tokens": 638088972.0, + "step": 24662 + }, + { + "epoch": 2.708433999560729, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6924853324890137, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7218442559242249, + "num_tokens": 638110260.0, + "step": 24663 + }, + { + "epoch": 2.708543817263343, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.569589376449585, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7172296643257141, + "num_tokens": 638132932.0, + "step": 24664 + }, + { + "epoch": 2.7086536349659562, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4976677894592285, + "learning_rate": 1e-06, + "loss": 1.1017, + "mean_token_accuracy": 0.6868252158164978, + "num_tokens": 638159454.0, + "step": 24665 + }, + { + "epoch": 2.7087634526685704, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.428150177001953, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 
0.7225486636161804, + "num_tokens": 638183545.0, + "step": 24666 + }, + { + "epoch": 2.7088732703711838, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1649580001831055, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7137280702590942, + "num_tokens": 638215187.0, + "step": 24667 + }, + { + "epoch": 2.7089830880737975, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.454237699508667, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7100510597229004, + "num_tokens": 638240218.0, + "step": 24668 + }, + { + "epoch": 2.7090929057764113, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.224167823791504, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7149075269699097, + "num_tokens": 638271344.0, + "step": 24669 + }, + { + "epoch": 2.7092027234790246, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1148972511291504, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7087607383728027, + "num_tokens": 638303577.0, + "step": 24670 + }, + { + "epoch": 2.7093125411816383, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2882299423217773, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6980056762695312, + "num_tokens": 638330973.0, + "step": 24671 + }, + { + "epoch": 2.709422358884252, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.283752918243408, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7159738540649414, + "num_tokens": 638357139.0, + "step": 24672 + }, + { + "epoch": 2.709532176586866, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.2473623752593994, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7174970507621765, + "num_tokens": 638373815.0, + "step": 24673 + }, + { + "epoch": 2.7096419942894796, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.532062530517578, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6876471638679504, + 
"num_tokens": 638399938.0, + "step": 24674 + }, + { + "epoch": 2.709751811992093, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.463163137435913, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7254904508590698, + "num_tokens": 638422808.0, + "step": 24675 + }, + { + "epoch": 2.7098616296947067, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.208188056945801, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7413196563720703, + "num_tokens": 638450788.0, + "step": 24676 + }, + { + "epoch": 2.7099714473973204, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.477814197540283, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.712104320526123, + "num_tokens": 638478890.0, + "step": 24677 + }, + { + "epoch": 2.710081265099934, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2821145057678223, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.719309389591217, + "num_tokens": 638510335.0, + "step": 24678 + }, + { + "epoch": 2.710191082802548, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5186052322387695, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7152962684631348, + "num_tokens": 638533854.0, + "step": 24679 + }, + { + "epoch": 2.7103009005051613, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.364565849304199, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6924082040786743, + "num_tokens": 638561507.0, + "step": 24680 + }, + { + "epoch": 2.710410718207775, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.513169765472412, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7180007696151733, + "num_tokens": 638586013.0, + "step": 24681 + }, + { + "epoch": 2.7105205359103888, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4194962978363037, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7363899946212769, + "num_tokens": 638609740.0, + "step": 
24682 + }, + { + "epoch": 2.7106303536130025, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.393463134765625, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.7056781649589539, + "num_tokens": 638636714.0, + "step": 24683 + }, + { + "epoch": 2.7107401713156163, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4721198081970215, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7268257141113281, + "num_tokens": 638661166.0, + "step": 24684 + }, + { + "epoch": 2.7108499890182296, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2206943035125732, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7046512365341187, + "num_tokens": 638689317.0, + "step": 24685 + }, + { + "epoch": 2.7109598067208434, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4693028926849365, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6904374957084656, + "num_tokens": 638714501.0, + "step": 24686 + }, + { + "epoch": 2.711069624423457, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4888057708740234, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6999078989028931, + "num_tokens": 638739101.0, + "step": 24687 + }, + { + "epoch": 2.711179442126071, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4873034954071045, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.699546217918396, + "num_tokens": 638766889.0, + "step": 24688 + }, + { + "epoch": 2.7112892598286846, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6707849502563477, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7154847383499146, + "num_tokens": 638789471.0, + "step": 24689 + }, + { + "epoch": 2.711399077531298, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.643174409866333, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7181401252746582, + "num_tokens": 638810942.0, + "step": 24690 + }, + { + "epoch": 
2.7115088952339117, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2375640869140625, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7145297527313232, + "num_tokens": 638843824.0, + "step": 24691 + }, + { + "epoch": 2.7116187129365255, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.144008159637451, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7128009796142578, + "num_tokens": 638874574.0, + "step": 24692 + }, + { + "epoch": 2.7117285306391388, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2972960472106934, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7100940942764282, + "num_tokens": 638903284.0, + "step": 24693 + }, + { + "epoch": 2.7118383483417525, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.595829963684082, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.7013570070266724, + "num_tokens": 638928789.0, + "step": 24694 + }, + { + "epoch": 2.7119481660443663, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.17948842048645, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6935137510299683, + "num_tokens": 638960128.0, + "step": 24695 + }, + { + "epoch": 2.71205798374698, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.330225944519043, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7174249291419983, + "num_tokens": 638986557.0, + "step": 24696 + }, + { + "epoch": 2.712167801449594, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1239206790924072, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6977470517158508, + "num_tokens": 639019260.0, + "step": 24697 + }, + { + "epoch": 2.712277619152207, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1431734561920166, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7088959217071533, + "num_tokens": 639051229.0, + "step": 24698 + }, + { + "epoch": 2.712387436854821, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.2726120948791504, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7112330198287964, + "num_tokens": 639080680.0, + "step": 24699 + }, + { + "epoch": 2.7124972545574346, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1328465938568115, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7237169742584229, + "num_tokens": 639112770.0, + "step": 24700 + }, + { + "epoch": 2.7126070722600484, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.742119312286377, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7153574228286743, + "num_tokens": 639132757.0, + "step": 24701 + }, + { + "epoch": 2.712716889962662, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.500277519226074, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7271919846534729, + "num_tokens": 639155224.0, + "step": 24702 + }, + { + "epoch": 2.7128267076652754, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3511955738067627, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7117774486541748, + "num_tokens": 639180807.0, + "step": 24703 + }, + { + "epoch": 2.712936525367889, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8278870582580566, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7045600414276123, + "num_tokens": 639200014.0, + "step": 24704 + }, + { + "epoch": 2.713046343070503, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7527565956115723, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7375540733337402, + "num_tokens": 639219260.0, + "step": 24705 + }, + { + "epoch": 2.7131561607731167, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.277318000793457, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.7026811838150024, + "num_tokens": 639250650.0, + "step": 24706 + }, + { + "epoch": 2.7132659784757305, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.393730401992798, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7193347811698914, + "num_tokens": 639275641.0, + "step": 24707 + }, + { + "epoch": 2.713375796178344, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.606872797012329, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7164998650550842, + "num_tokens": 639297757.0, + "step": 24708 + }, + { + "epoch": 2.7134856138809575, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2442333698272705, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7141020894050598, + "num_tokens": 639325714.0, + "step": 24709 + }, + { + "epoch": 2.7135954315835713, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7067995071411133, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7274848222732544, + "num_tokens": 639345003.0, + "step": 24710 + }, + { + "epoch": 2.713705249286185, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.387238025665283, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7154014110565186, + "num_tokens": 639369979.0, + "step": 24711 + }, + { + "epoch": 2.713815066988799, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3123836517333984, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.7114845514297485, + "num_tokens": 639398053.0, + "step": 24712 + }, + { + "epoch": 2.713924884691412, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.248345375061035, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6973831653594971, + "num_tokens": 639428209.0, + "step": 24713 + }, + { + "epoch": 2.714034702394026, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3087711334228516, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7239324450492859, + "num_tokens": 639454434.0, + "step": 24714 + }, + { + "epoch": 2.7141445200966396, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5501794815063477, 
+ "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7447880506515503, + "num_tokens": 639476779.0, + "step": 24715 + }, + { + "epoch": 2.7142543377992534, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.389047622680664, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7218923568725586, + "num_tokens": 639503392.0, + "step": 24716 + }, + { + "epoch": 2.714364155501867, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.409593105316162, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7379026412963867, + "num_tokens": 639527342.0, + "step": 24717 + }, + { + "epoch": 2.7144739732044805, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6520535945892334, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7272301912307739, + "num_tokens": 639550694.0, + "step": 24718 + }, + { + "epoch": 2.7145837909070942, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.038464069366455, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7166245579719543, + "num_tokens": 639585039.0, + "step": 24719 + }, + { + "epoch": 2.714693608609708, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6321921348571777, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7277171611785889, + "num_tokens": 639607137.0, + "step": 24720 + }, + { + "epoch": 2.7148034263123213, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2828965187072754, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7136020660400391, + "num_tokens": 639635501.0, + "step": 24721 + }, + { + "epoch": 2.714913244014935, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3021442890167236, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7030194997787476, + "num_tokens": 639665461.0, + "step": 24722 + }, + { + "epoch": 2.715023061717549, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3690736293792725, + "learning_rate": 1e-06, + 
"loss": 1.0105, + "mean_token_accuracy": 0.6991904377937317, + "num_tokens": 639693491.0, + "step": 24723 + }, + { + "epoch": 2.7151328794201626, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6925995349884033, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7163005471229553, + "num_tokens": 639715777.0, + "step": 24724 + }, + { + "epoch": 2.7152426971227763, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.430088520050049, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7142747640609741, + "num_tokens": 639741348.0, + "step": 24725 + }, + { + "epoch": 2.7153525148253896, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.518202304840088, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7161393761634827, + "num_tokens": 639764709.0, + "step": 24726 + }, + { + "epoch": 2.7154623325280034, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2159435749053955, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7278045415878296, + "num_tokens": 639793037.0, + "step": 24727 + }, + { + "epoch": 2.715572150230617, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3930106163024902, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6973004937171936, + "num_tokens": 639820215.0, + "step": 24728 + }, + { + "epoch": 2.715681967933231, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.583698034286499, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7093008160591125, + "num_tokens": 639842429.0, + "step": 24729 + }, + { + "epoch": 2.7157917856358447, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.337740659713745, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7119078040122986, + "num_tokens": 639873513.0, + "step": 24730 + }, + { + "epoch": 2.715901603338458, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4366133213043213, + "learning_rate": 1e-06, + "loss": 1.032, + 
"mean_token_accuracy": 0.6921157836914062, + "num_tokens": 639899481.0, + "step": 24731 + }, + { + "epoch": 2.7160114210410717, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.438124656677246, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7120387554168701, + "num_tokens": 639924799.0, + "step": 24732 + }, + { + "epoch": 2.7161212387436855, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.325481653213501, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7080974578857422, + "num_tokens": 639951117.0, + "step": 24733 + }, + { + "epoch": 2.7162310564462993, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3675906658172607, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7210173606872559, + "num_tokens": 639974759.0, + "step": 24734 + }, + { + "epoch": 2.716340874148913, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7032358646392822, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.708803653717041, + "num_tokens": 639997730.0, + "step": 24735 + }, + { + "epoch": 2.7164506918515263, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.659107208251953, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7175191640853882, + "num_tokens": 640020029.0, + "step": 24736 + }, + { + "epoch": 2.71656050955414, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3927528858184814, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7154924869537354, + "num_tokens": 640048122.0, + "step": 24737 + }, + { + "epoch": 2.716670327256754, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.269534111022949, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6966437101364136, + "num_tokens": 640077578.0, + "step": 24738 + }, + { + "epoch": 2.7167801449593676, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4137187004089355, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 
0.7447015047073364, + "num_tokens": 640102825.0, + "step": 24739 + }, + { + "epoch": 2.7168899626619814, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.39593768119812, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7244908213615417, + "num_tokens": 640132048.0, + "step": 24740 + }, + { + "epoch": 2.7169997803645947, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.687701463699341, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7151674628257751, + "num_tokens": 640156823.0, + "step": 24741 + }, + { + "epoch": 2.7171095980672084, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5269100666046143, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7039994597434998, + "num_tokens": 640180505.0, + "step": 24742 + }, + { + "epoch": 2.717219415769822, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.68392276763916, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7432711124420166, + "num_tokens": 640199882.0, + "step": 24743 + }, + { + "epoch": 2.7173292334724355, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3225173950195312, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7242358922958374, + "num_tokens": 640228809.0, + "step": 24744 + }, + { + "epoch": 2.7174390511750497, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4216291904449463, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.7002897262573242, + "num_tokens": 640256072.0, + "step": 24745 + }, + { + "epoch": 2.717548868877663, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.371906042098999, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7328873872756958, + "num_tokens": 640282519.0, + "step": 24746 + }, + { + "epoch": 2.7176586865802768, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5224947929382324, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7026209235191345, + 
"num_tokens": 640307450.0, + "step": 24747 + }, + { + "epoch": 2.7177685042828905, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.643497943878174, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7157180905342102, + "num_tokens": 640329891.0, + "step": 24748 + }, + { + "epoch": 2.717878321985504, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0663976669311523, + "learning_rate": 1e-06, + "loss": 1.0959, + "mean_token_accuracy": 0.685254693031311, + "num_tokens": 640364993.0, + "step": 24749 + }, + { + "epoch": 2.7179881396881176, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3486876487731934, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7151947021484375, + "num_tokens": 640392942.0, + "step": 24750 + }, + { + "epoch": 2.7180979573907313, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3886735439300537, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7042980790138245, + "num_tokens": 640419676.0, + "step": 24751 + }, + { + "epoch": 2.718207775093345, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3576064109802246, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7194629311561584, + "num_tokens": 640448091.0, + "step": 24752 + }, + { + "epoch": 2.718317592795959, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3507697582244873, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.7000445127487183, + "num_tokens": 640477778.0, + "step": 24753 + }, + { + "epoch": 2.718427410498572, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.613436698913574, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7122869491577148, + "num_tokens": 640500398.0, + "step": 24754 + }, + { + "epoch": 2.718537228201186, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.396103858947754, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7249115705490112, + "num_tokens": 640528757.0, + 
"step": 24755 + }, + { + "epoch": 2.7186470459037997, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.562676191329956, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6908259987831116, + "num_tokens": 640554774.0, + "step": 24756 + }, + { + "epoch": 2.7187568636064134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.31569766998291, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7056053876876831, + "num_tokens": 640582467.0, + "step": 24757 + }, + { + "epoch": 2.718866681309027, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.15146803855896, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7073893547058105, + "num_tokens": 640614279.0, + "step": 24758 + }, + { + "epoch": 2.7189764990116405, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3814897537231445, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7081209421157837, + "num_tokens": 640642335.0, + "step": 24759 + }, + { + "epoch": 2.7190863167142543, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.37917423248291, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6941485404968262, + "num_tokens": 640669322.0, + "step": 24760 + }, + { + "epoch": 2.719196134416868, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8920974731445312, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7138454914093018, + "num_tokens": 640687288.0, + "step": 24761 + }, + { + "epoch": 2.719305952119482, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5980043411254883, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.711820662021637, + "num_tokens": 640710975.0, + "step": 24762 + }, + { + "epoch": 2.7194157698220955, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0316569805145264, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7112404108047485, + "num_tokens": 640747982.0, + "step": 24763 + }, + { + "epoch": 
2.719525587524709, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.431849479675293, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7383770942687988, + "num_tokens": 640775056.0, + "step": 24764 + }, + { + "epoch": 2.7196354052273226, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3124001026153564, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7029989957809448, + "num_tokens": 640802794.0, + "step": 24765 + }, + { + "epoch": 2.7197452229299364, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4252521991729736, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.7021526098251343, + "num_tokens": 640828970.0, + "step": 24766 + }, + { + "epoch": 2.71985504063255, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3257577419281006, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7200873494148254, + "num_tokens": 640856830.0, + "step": 24767 + }, + { + "epoch": 2.719964858335164, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.220245838165283, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6932086944580078, + "num_tokens": 640888052.0, + "step": 24768 + }, + { + "epoch": 2.720074676037777, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4857077598571777, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.726521372795105, + "num_tokens": 640912422.0, + "step": 24769 + }, + { + "epoch": 2.720184493740391, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.675860643386841, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.716048002243042, + "num_tokens": 640933340.0, + "step": 24770 + }, + { + "epoch": 2.7202943114430047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.694814920425415, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7363481521606445, + "num_tokens": 640953858.0, + "step": 24771 + }, + { + "epoch": 2.720404129145618, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.4790549278259277, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7164725065231323, + "num_tokens": 640979591.0, + "step": 24772 + }, + { + "epoch": 2.720513946848232, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.410430431365967, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7144538760185242, + "num_tokens": 641005981.0, + "step": 24773 + }, + { + "epoch": 2.7206237645508455, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5286598205566406, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7293285131454468, + "num_tokens": 641030596.0, + "step": 24774 + }, + { + "epoch": 2.7207335822534593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3888931274414062, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.689134955406189, + "num_tokens": 641057299.0, + "step": 24775 + }, + { + "epoch": 2.720843399956073, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4919846057891846, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7110918164253235, + "num_tokens": 641083248.0, + "step": 24776 + }, + { + "epoch": 2.7209532176586864, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.451347589492798, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6873050928115845, + "num_tokens": 641110826.0, + "step": 24777 + }, + { + "epoch": 2.7210630353613, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4683353900909424, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7309145927429199, + "num_tokens": 641138748.0, + "step": 24778 + }, + { + "epoch": 2.721172853063914, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4425621032714844, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7018113732337952, + "num_tokens": 641167288.0, + "step": 24779 + }, + { + "epoch": 2.7212826707665276, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.3959922790527344, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7056846618652344, + "num_tokens": 641196385.0, + "step": 24780 + }, + { + "epoch": 2.7213924884691414, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.507195234298706, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7083644270896912, + "num_tokens": 641222366.0, + "step": 24781 + }, + { + "epoch": 2.7215023061717547, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.096792697906494, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7091420292854309, + "num_tokens": 641255955.0, + "step": 24782 + }, + { + "epoch": 2.7216121238743685, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.48582124710083, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.6997396945953369, + "num_tokens": 641280874.0, + "step": 24783 + }, + { + "epoch": 2.721721941576982, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6379659175872803, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7211965322494507, + "num_tokens": 641303592.0, + "step": 24784 + }, + { + "epoch": 2.721831759279596, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.0036797523498535, + "learning_rate": 1e-06, + "loss": 1.0808, + "mean_token_accuracy": 0.6824324131011963, + "num_tokens": 641336893.0, + "step": 24785 + }, + { + "epoch": 2.7219415769822097, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6486194133758545, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7216215133666992, + "num_tokens": 641359312.0, + "step": 24786 + }, + { + "epoch": 2.722051394684823, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.573741912841797, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7341165542602539, + "num_tokens": 641383693.0, + "step": 24787 + }, + { + "epoch": 2.722161212387437, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.623386859893799, + 
"learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7189435958862305, + "num_tokens": 641407736.0, + "step": 24788 + }, + { + "epoch": 2.7222710300900506, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.460836410522461, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6838861703872681, + "num_tokens": 641433221.0, + "step": 24789 + }, + { + "epoch": 2.7223808477926643, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5990242958068848, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7111631631851196, + "num_tokens": 641456588.0, + "step": 24790 + }, + { + "epoch": 2.722490665495278, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.071781873703003, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6941251158714294, + "num_tokens": 641491270.0, + "step": 24791 + }, + { + "epoch": 2.7226004831978914, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.292113780975342, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7051065564155579, + "num_tokens": 641524657.0, + "step": 24792 + }, + { + "epoch": 2.722710300900505, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4551901817321777, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6920759677886963, + "num_tokens": 641551827.0, + "step": 24793 + }, + { + "epoch": 2.722820118603119, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8588364124298096, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7395561933517456, + "num_tokens": 641570975.0, + "step": 24794 + }, + { + "epoch": 2.722929936305732, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.704961061477661, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7200108766555786, + "num_tokens": 641591883.0, + "step": 24795 + }, + { + "epoch": 2.7230397540083464, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.634445905685425, + "learning_rate": 1e-06, + "loss": 
0.976, + "mean_token_accuracy": 0.7137724161148071, + "num_tokens": 641615483.0, + "step": 24796 + }, + { + "epoch": 2.7231495717109597, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417447566986084, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7307142019271851, + "num_tokens": 641640463.0, + "step": 24797 + }, + { + "epoch": 2.7232593894135735, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3569040298461914, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7036846876144409, + "num_tokens": 641669341.0, + "step": 24798 + }, + { + "epoch": 2.7233692071161872, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.489579439163208, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6969254612922668, + "num_tokens": 641695953.0, + "step": 24799 + }, + { + "epoch": 2.7234790248188006, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4807491302490234, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7101649045944214, + "num_tokens": 641720964.0, + "step": 24800 + }, + { + "epoch": 2.7235888425214143, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1577858924865723, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.7025105357170105, + "num_tokens": 641752724.0, + "step": 24801 + }, + { + "epoch": 2.723698660224028, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7198288440704346, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7303839325904846, + "num_tokens": 641774184.0, + "step": 24802 + }, + { + "epoch": 2.723808477926642, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2829785346984863, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7012386322021484, + "num_tokens": 641802550.0, + "step": 24803 + }, + { + "epoch": 2.7239182956292556, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.738171100616455, + "learning_rate": 1e-06, + "loss": 0.8575, + 
"mean_token_accuracy": 0.7366455793380737, + "num_tokens": 641822945.0, + "step": 24804 + }, + { + "epoch": 2.724028113331869, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4157960414886475, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7288376092910767, + "num_tokens": 641848120.0, + "step": 24805 + }, + { + "epoch": 2.7241379310344827, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3054563999176025, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7319327592849731, + "num_tokens": 641874104.0, + "step": 24806 + }, + { + "epoch": 2.7242477487370964, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5053722858428955, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7242507338523865, + "num_tokens": 641898951.0, + "step": 24807 + }, + { + "epoch": 2.72435756643971, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5642335414886475, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7324826717376709, + "num_tokens": 641921362.0, + "step": 24808 + }, + { + "epoch": 2.724467384142324, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.108325481414795, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7253472805023193, + "num_tokens": 641938341.0, + "step": 24809 + }, + { + "epoch": 2.7245772018449372, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1999645233154297, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6928176879882812, + "num_tokens": 641970748.0, + "step": 24810 + }, + { + "epoch": 2.724687019547551, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3500876426696777, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6947852969169617, + "num_tokens": 642001075.0, + "step": 24811 + }, + { + "epoch": 2.7247968372501647, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3555495738983154, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 
0.7353020906448364, + "num_tokens": 642026261.0, + "step": 24812 + }, + { + "epoch": 2.7249066549527785, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.0936598777771, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7026611566543579, + "num_tokens": 642055981.0, + "step": 24813 + }, + { + "epoch": 2.7250164726553923, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4352974891662598, + "learning_rate": 1e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6922953724861145, + "num_tokens": 642082195.0, + "step": 24814 + }, + { + "epoch": 2.7251262903580056, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2583417892456055, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7118467688560486, + "num_tokens": 642114405.0, + "step": 24815 + }, + { + "epoch": 2.7252361080606193, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5097384452819824, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7017512917518616, + "num_tokens": 642139251.0, + "step": 24816 + }, + { + "epoch": 2.725345925763233, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5866880416870117, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7217589616775513, + "num_tokens": 642163813.0, + "step": 24817 + }, + { + "epoch": 2.725455743465847, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.34842848777771, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7219150066375732, + "num_tokens": 642189645.0, + "step": 24818 + }, + { + "epoch": 2.7255655611684606, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.265117883682251, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7154438495635986, + "num_tokens": 642219610.0, + "step": 24819 + }, + { + "epoch": 2.725675378871074, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7246768474578857, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.720409631729126, + 
"num_tokens": 642240897.0, + "step": 24820 + }, + { + "epoch": 2.7257851965736877, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.439290761947632, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7071166038513184, + "num_tokens": 642270152.0, + "step": 24821 + }, + { + "epoch": 2.7258950142763014, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1946816444396973, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7152289152145386, + "num_tokens": 642303524.0, + "step": 24822 + }, + { + "epoch": 2.7260048319789147, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.619739294052124, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7096816897392273, + "num_tokens": 642326904.0, + "step": 24823 + }, + { + "epoch": 2.7261146496815285, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.45955491065979, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7135779857635498, + "num_tokens": 642352381.0, + "step": 24824 + }, + { + "epoch": 2.7262244673841423, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3949782848358154, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7118536829948425, + "num_tokens": 642378034.0, + "step": 24825 + }, + { + "epoch": 2.726334285086756, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4654769897460938, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7216529846191406, + "num_tokens": 642403044.0, + "step": 24826 + }, + { + "epoch": 2.7264441027893698, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6272501945495605, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.710238516330719, + "num_tokens": 642426545.0, + "step": 24827 + }, + { + "epoch": 2.726553920491983, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3547301292419434, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7382014989852905, + "num_tokens": 642452117.0, + 
"step": 24828 + }, + { + "epoch": 2.726663738194597, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.482710361480713, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6986078023910522, + "num_tokens": 642478164.0, + "step": 24829 + }, + { + "epoch": 2.7267735558972106, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.265625238418579, + "learning_rate": 1e-06, + "loss": 1.0712, + "mean_token_accuracy": 0.6908983588218689, + "num_tokens": 642506938.0, + "step": 24830 + }, + { + "epoch": 2.7268833735998244, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1631057262420654, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6973066329956055, + "num_tokens": 642542506.0, + "step": 24831 + }, + { + "epoch": 2.726993191302438, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6231696605682373, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7208641171455383, + "num_tokens": 642563542.0, + "step": 24832 + }, + { + "epoch": 2.7271030090050514, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1786389350891113, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7112115025520325, + "num_tokens": 642595227.0, + "step": 24833 + }, + { + "epoch": 2.727212826707665, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9246065616607666, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7315402030944824, + "num_tokens": 642613641.0, + "step": 24834 + }, + { + "epoch": 2.727322644410279, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3911263942718506, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7092709541320801, + "num_tokens": 642640041.0, + "step": 24835 + }, + { + "epoch": 2.7274324621128927, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7249183654785156, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7450536489486694, + "num_tokens": 642658862.0, + "step": 24836 + }, + { + 
"epoch": 2.7275422798155065, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5380098819732666, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7170537710189819, + "num_tokens": 642683284.0, + "step": 24837 + }, + { + "epoch": 2.7276520975181198, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3148276805877686, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7270023226737976, + "num_tokens": 642711859.0, + "step": 24838 + }, + { + "epoch": 2.7277619152207335, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.061163902282715, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7012166976928711, + "num_tokens": 642749230.0, + "step": 24839 + }, + { + "epoch": 2.7278717329233473, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3882789611816406, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6998462677001953, + "num_tokens": 642775552.0, + "step": 24840 + }, + { + "epoch": 2.727981550625961, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6102616786956787, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7360928058624268, + "num_tokens": 642797175.0, + "step": 24841 + }, + { + "epoch": 2.728091368328575, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.419240713119507, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7101988792419434, + "num_tokens": 642823312.0, + "step": 24842 + }, + { + "epoch": 2.728201186031188, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3176023960113525, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7088382840156555, + "num_tokens": 642851144.0, + "step": 24843 + }, + { + "epoch": 2.728311003733802, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3569984436035156, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.722042441368103, + "num_tokens": 642878449.0, + "step": 24844 + }, + { + "epoch": 2.7284208214364156, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5477306842803955, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7368930578231812, + "num_tokens": 642900043.0, + "step": 24845 + }, + { + "epoch": 2.728530639139029, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.18772292137146, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6902481317520142, + "num_tokens": 642933975.0, + "step": 24846 + }, + { + "epoch": 2.728640456841643, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9180448055267334, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7195305824279785, + "num_tokens": 642957037.0, + "step": 24847 + }, + { + "epoch": 2.7287502745442564, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.353503704071045, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7205532789230347, + "num_tokens": 642983748.0, + "step": 24848 + }, + { + "epoch": 2.72886009224687, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.145224094390869, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7146230936050415, + "num_tokens": 643015345.0, + "step": 24849 + }, + { + "epoch": 2.728969909949484, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.647160291671753, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7122432589530945, + "num_tokens": 643037919.0, + "step": 24850 + }, + { + "epoch": 2.7290797276520973, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.230944871902466, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7287943363189697, + "num_tokens": 643066006.0, + "step": 24851 + }, + { + "epoch": 2.729189545354711, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.302004337310791, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7330042719841003, + "num_tokens": 643092663.0, + "step": 24852 + }, + { + "epoch": 2.729299363057325, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.2770910263061523, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7128823399543762, + "num_tokens": 643122114.0, + "step": 24853 + }, + { + "epoch": 2.7294091807599385, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.632821559906006, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7216222286224365, + "num_tokens": 643143373.0, + "step": 24854 + }, + { + "epoch": 2.7295189984625523, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5162928104400635, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7308818101882935, + "num_tokens": 643166868.0, + "step": 24855 + }, + { + "epoch": 2.7296288161651656, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5067622661590576, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7049307227134705, + "num_tokens": 643195898.0, + "step": 24856 + }, + { + "epoch": 2.7297386338677794, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.272480010986328, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6907689571380615, + "num_tokens": 643228559.0, + "step": 24857 + }, + { + "epoch": 2.729848451570393, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3629915714263916, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7328280806541443, + "num_tokens": 643254623.0, + "step": 24858 + }, + { + "epoch": 2.729958269273007, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4671790599823, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7311779856681824, + "num_tokens": 643280531.0, + "step": 24859 + }, + { + "epoch": 2.7300680869756206, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.686931610107422, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7223899364471436, + "num_tokens": 643300152.0, + "step": 24860 + }, + { + "epoch": 2.730177904678234, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3260984420776367, 
+ "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7161462306976318, + "num_tokens": 643327754.0, + "step": 24861 + }, + { + "epoch": 2.7302877223808477, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6051597595214844, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7236562967300415, + "num_tokens": 643352108.0, + "step": 24862 + }, + { + "epoch": 2.7303975400834615, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2871665954589844, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7138256430625916, + "num_tokens": 643380193.0, + "step": 24863 + }, + { + "epoch": 2.7305073577860752, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3165433406829834, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6958856582641602, + "num_tokens": 643409522.0, + "step": 24864 + }, + { + "epoch": 2.730617175488689, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.292787551879883, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7184970378875732, + "num_tokens": 643437342.0, + "step": 24865 + }, + { + "epoch": 2.7307269931913023, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4474053382873535, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7116221189498901, + "num_tokens": 643463951.0, + "step": 24866 + }, + { + "epoch": 2.730836810893916, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.545248508453369, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7347767353057861, + "num_tokens": 643487187.0, + "step": 24867 + }, + { + "epoch": 2.73094662859653, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6254677772521973, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7050612568855286, + "num_tokens": 643508957.0, + "step": 24868 + }, + { + "epoch": 2.7310564462991436, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4306397438049316, + "learning_rate": 1e-06, + 
"loss": 0.9491, + "mean_token_accuracy": 0.7134947180747986, + "num_tokens": 643534542.0, + "step": 24869 + }, + { + "epoch": 2.7311662640017573, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.089469909667969, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7177467346191406, + "num_tokens": 643560555.0, + "step": 24870 + }, + { + "epoch": 2.7312760817043706, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.140244960784912, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7081928253173828, + "num_tokens": 643594435.0, + "step": 24871 + }, + { + "epoch": 2.7313858994069844, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.311511516571045, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6946878433227539, + "num_tokens": 643623464.0, + "step": 24872 + }, + { + "epoch": 2.731495717109598, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4094691276550293, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7093650102615356, + "num_tokens": 643651590.0, + "step": 24873 + }, + { + "epoch": 2.7316055348122115, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5233728885650635, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7152485251426697, + "num_tokens": 643679139.0, + "step": 24874 + }, + { + "epoch": 2.7317153525148252, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.540588855743408, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7061295509338379, + "num_tokens": 643704449.0, + "step": 24875 + }, + { + "epoch": 2.731825170217439, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.597573757171631, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7205461859703064, + "num_tokens": 643727376.0, + "step": 24876 + }, + { + "epoch": 2.7319349879200527, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5562644004821777, + "learning_rate": 1e-06, + "loss": 0.9849, + 
"mean_token_accuracy": 0.7092640399932861, + "num_tokens": 643751934.0, + "step": 24877 + }, + { + "epoch": 2.7320448056226665, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4131407737731934, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7143203020095825, + "num_tokens": 643778941.0, + "step": 24878 + }, + { + "epoch": 2.73215462332528, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7023258209228516, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7167822122573853, + "num_tokens": 643805336.0, + "step": 24879 + }, + { + "epoch": 2.7322644410278936, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2762386798858643, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6881263852119446, + "num_tokens": 643834946.0, + "step": 24880 + }, + { + "epoch": 2.7323742587305073, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.251805543899536, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7073723077774048, + "num_tokens": 643864791.0, + "step": 24881 + }, + { + "epoch": 2.732484076433121, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.275681257247925, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.7014968991279602, + "num_tokens": 643896644.0, + "step": 24882 + }, + { + "epoch": 2.732593894135735, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3944783210754395, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7254094481468201, + "num_tokens": 643924896.0, + "step": 24883 + }, + { + "epoch": 2.732703711838348, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.25093936920166, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.724632978439331, + "num_tokens": 643955376.0, + "step": 24884 + }, + { + "epoch": 2.732813529540962, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2185451984405518, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 
0.7301275730133057, + "num_tokens": 643984504.0, + "step": 24885 + }, + { + "epoch": 2.7329233472435757, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.503553628921509, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7113139033317566, + "num_tokens": 644010345.0, + "step": 24886 + }, + { + "epoch": 2.7330331649461894, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5145108699798584, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7161768674850464, + "num_tokens": 644034153.0, + "step": 24887 + }, + { + "epoch": 2.733142982648803, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3901634216308594, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7178527116775513, + "num_tokens": 644059787.0, + "step": 24888 + }, + { + "epoch": 2.7332528003514165, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.57210111618042, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7286205291748047, + "num_tokens": 644082650.0, + "step": 24889 + }, + { + "epoch": 2.7333626180540302, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.449563503265381, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7319065928459167, + "num_tokens": 644106614.0, + "step": 24890 + }, + { + "epoch": 2.733472435756644, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5338780879974365, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7046905755996704, + "num_tokens": 644130761.0, + "step": 24891 + }, + { + "epoch": 2.7335822534592578, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5328354835510254, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7295257449150085, + "num_tokens": 644154838.0, + "step": 24892 + }, + { + "epoch": 2.7336920711618715, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.640684127807617, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7257490754127502, + 
"num_tokens": 644179150.0, + "step": 24893 + }, + { + "epoch": 2.733801888864485, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2996950149536133, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7236413955688477, + "num_tokens": 644206990.0, + "step": 24894 + }, + { + "epoch": 2.7339117065670986, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.601320505142212, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6954408288002014, + "num_tokens": 644233162.0, + "step": 24895 + }, + { + "epoch": 2.7340215242697123, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5531210899353027, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6934930086135864, + "num_tokens": 644256968.0, + "step": 24896 + }, + { + "epoch": 2.734131341972326, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3241991996765137, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7101264595985413, + "num_tokens": 644284581.0, + "step": 24897 + }, + { + "epoch": 2.73424115967494, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.55153226852417, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7162598371505737, + "num_tokens": 644308226.0, + "step": 24898 + }, + { + "epoch": 2.734350977377553, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1913135051727295, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6990188956260681, + "num_tokens": 644340434.0, + "step": 24899 + }, + { + "epoch": 2.734460795080167, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2459211349487305, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7170735597610474, + "num_tokens": 644371815.0, + "step": 24900 + }, + { + "epoch": 2.7345706127827807, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4022326469421387, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7109765410423279, + "num_tokens": 644396875.0, + 
"step": 24901 + }, + { + "epoch": 2.734680430485394, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.445378303527832, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7112717032432556, + "num_tokens": 644423665.0, + "step": 24902 + }, + { + "epoch": 2.7347902481880078, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2915732860565186, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7007901668548584, + "num_tokens": 644455080.0, + "step": 24903 + }, + { + "epoch": 2.7349000658906215, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4901838302612305, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7280983924865723, + "num_tokens": 644478301.0, + "step": 24904 + }, + { + "epoch": 2.7350098835932353, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.578695058822632, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7205356359481812, + "num_tokens": 644500203.0, + "step": 24905 + }, + { + "epoch": 2.735119701295849, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3357090950012207, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7056314945220947, + "num_tokens": 644528204.0, + "step": 24906 + }, + { + "epoch": 2.7352295189984623, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7611820697784424, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7345719337463379, + "num_tokens": 644547240.0, + "step": 24907 + }, + { + "epoch": 2.735339336701076, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5272634029388428, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7223212718963623, + "num_tokens": 644570855.0, + "step": 24908 + }, + { + "epoch": 2.73544915440369, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5394179821014404, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.697352409362793, + "num_tokens": 644595127.0, + "step": 24909 + }, + { + 
"epoch": 2.7355589721063036, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3370344638824463, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7184116244316101, + "num_tokens": 644625702.0, + "step": 24910 + }, + { + "epoch": 2.7356687898089174, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6669018268585205, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.699303150177002, + "num_tokens": 644649169.0, + "step": 24911 + }, + { + "epoch": 2.7357786075115307, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7180140018463135, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7297438979148865, + "num_tokens": 644670190.0, + "step": 24912 + }, + { + "epoch": 2.7358884252141444, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5005135536193848, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7153645753860474, + "num_tokens": 644694420.0, + "step": 24913 + }, + { + "epoch": 2.735998242916758, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6407461166381836, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7063513994216919, + "num_tokens": 644717165.0, + "step": 24914 + }, + { + "epoch": 2.736108060619372, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.379216432571411, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7233532667160034, + "num_tokens": 644745186.0, + "step": 24915 + }, + { + "epoch": 2.7362178783219857, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.318828582763672, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.702418863773346, + "num_tokens": 644773772.0, + "step": 24916 + }, + { + "epoch": 2.736327696024599, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.45910382270813, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7008326649665833, + "num_tokens": 644800001.0, + "step": 24917 + }, + { + "epoch": 2.736437513727213, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7187891006469727, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7064760327339172, + "num_tokens": 644825060.0, + "step": 24918 + }, + { + "epoch": 2.7365473314298265, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.60943341255188, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7176331281661987, + "num_tokens": 644848268.0, + "step": 24919 + }, + { + "epoch": 2.7366571491324403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7359466552734375, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7229433059692383, + "num_tokens": 644868479.0, + "step": 24920 + }, + { + "epoch": 2.736766966835054, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.9749150276184082, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7114080190658569, + "num_tokens": 644904886.0, + "step": 24921 + }, + { + "epoch": 2.7368767845376674, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0735371112823486, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6963649988174438, + "num_tokens": 644942507.0, + "step": 24922 + }, + { + "epoch": 2.736986602240281, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7526538372039795, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7416972517967224, + "num_tokens": 644960996.0, + "step": 24923 + }, + { + "epoch": 2.737096419942895, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3093974590301514, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7165135145187378, + "num_tokens": 644991209.0, + "step": 24924 + }, + { + "epoch": 2.737206237645508, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3427276611328125, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6968288421630859, + "num_tokens": 645021253.0, + "step": 24925 + }, + { + "epoch": 2.7373160553481224, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 3.9113659858703613, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.732421338558197, + "num_tokens": 645045743.0, + "step": 24926 + }, + { + "epoch": 2.7374258730507357, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4295854568481445, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.713460385799408, + "num_tokens": 645070358.0, + "step": 24927 + }, + { + "epoch": 2.7375356907533495, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4863903522491455, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7327858805656433, + "num_tokens": 645093569.0, + "step": 24928 + }, + { + "epoch": 2.737645508455963, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.265427350997925, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7037354707717896, + "num_tokens": 645123189.0, + "step": 24929 + }, + { + "epoch": 2.7377553261585765, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4534482955932617, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7239359021186829, + "num_tokens": 645148340.0, + "step": 24930 + }, + { + "epoch": 2.7378651438611903, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7350502014160156, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.734285295009613, + "num_tokens": 645170493.0, + "step": 24931 + }, + { + "epoch": 2.737974961563804, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4202303886413574, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7047823667526245, + "num_tokens": 645196556.0, + "step": 24932 + }, + { + "epoch": 2.738084779266418, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4000535011291504, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.711329996585846, + "num_tokens": 645222041.0, + "step": 24933 + }, + { + "epoch": 2.7381945969690316, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.513728141784668, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7326008081436157, + "num_tokens": 645243118.0, + "step": 24934 + }, + { + "epoch": 2.738304414671645, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9789626598358154, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.715095043182373, + "num_tokens": 645261831.0, + "step": 24935 + }, + { + "epoch": 2.7384142323742586, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.376995801925659, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6971395015716553, + "num_tokens": 645289555.0, + "step": 24936 + }, + { + "epoch": 2.7385240500768724, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6893112659454346, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7240936756134033, + "num_tokens": 645311314.0, + "step": 24937 + }, + { + "epoch": 2.738633867779486, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.355633497238159, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6970523595809937, + "num_tokens": 645341909.0, + "step": 24938 + }, + { + "epoch": 2.7387436854821, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.647867441177368, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6986949443817139, + "num_tokens": 645366844.0, + "step": 24939 + }, + { + "epoch": 2.738853503184713, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7133584022521973, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.709597647190094, + "num_tokens": 645389199.0, + "step": 24940 + }, + { + "epoch": 2.738963320887327, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1864898204803467, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.709997832775116, + "num_tokens": 645423038.0, + "step": 24941 + }, + { + "epoch": 2.7390731385899407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.319291591644287, + 
"learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7272524237632751, + "num_tokens": 645451731.0, + "step": 24942 + }, + { + "epoch": 2.7391829562925545, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3530776500701904, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7133185863494873, + "num_tokens": 645478334.0, + "step": 24943 + }, + { + "epoch": 2.7392927739951682, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5709993839263916, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.709876298904419, + "num_tokens": 645502011.0, + "step": 24944 + }, + { + "epoch": 2.7394025916977816, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4817514419555664, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7156030535697937, + "num_tokens": 645526537.0, + "step": 24945 + }, + { + "epoch": 2.7395124094003953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1729869842529297, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7461222410202026, + "num_tokens": 645555576.0, + "step": 24946 + }, + { + "epoch": 2.739622227103009, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5716731548309326, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.720316469669342, + "num_tokens": 645579414.0, + "step": 24947 + }, + { + "epoch": 2.739732044805623, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.713808298110962, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7258899211883545, + "num_tokens": 645600487.0, + "step": 24948 + }, + { + "epoch": 2.7398418625082366, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.361435651779175, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7093747854232788, + "num_tokens": 645627786.0, + "step": 24949 + }, + { + "epoch": 2.73995168021085, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.731837749481201, + "learning_rate": 1e-06, + "loss": 
0.9312, + "mean_token_accuracy": 0.7266514897346497, + "num_tokens": 645650574.0, + "step": 24950 + }, + { + "epoch": 2.7400614979134637, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2889225482940674, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6912678480148315, + "num_tokens": 645687087.0, + "step": 24951 + }, + { + "epoch": 2.7401713156160774, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4760453701019287, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7186569571495056, + "num_tokens": 645710254.0, + "step": 24952 + }, + { + "epoch": 2.7402811333186907, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2255561351776123, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7029725313186646, + "num_tokens": 645740072.0, + "step": 24953 + }, + { + "epoch": 2.7403909510213045, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9378395080566406, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7245435118675232, + "num_tokens": 645758717.0, + "step": 24954 + }, + { + "epoch": 2.7405007687239182, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3939061164855957, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7007372379302979, + "num_tokens": 645783376.0, + "step": 24955 + }, + { + "epoch": 2.740610586426532, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417046308517456, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.7000275254249573, + "num_tokens": 645810540.0, + "step": 24956 + }, + { + "epoch": 2.7407204041291457, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5881357192993164, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.714069128036499, + "num_tokens": 645836400.0, + "step": 24957 + }, + { + "epoch": 2.740830221831759, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2490663528442383, + "learning_rate": 1e-06, + "loss": 1.0365, + 
"mean_token_accuracy": 0.6976013779640198, + "num_tokens": 645865217.0, + "step": 24958 + }, + { + "epoch": 2.740940039534373, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2001380920410156, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.7011522650718689, + "num_tokens": 645897124.0, + "step": 24959 + }, + { + "epoch": 2.7410498572369866, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.520174741744995, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7416011095046997, + "num_tokens": 645919008.0, + "step": 24960 + }, + { + "epoch": 2.7411596749396003, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4475624561309814, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7099996209144592, + "num_tokens": 645944119.0, + "step": 24961 + }, + { + "epoch": 2.741269492642214, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2324109077453613, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6948333978652954, + "num_tokens": 645976846.0, + "step": 24962 + }, + { + "epoch": 2.7413793103448274, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5699048042297363, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7204234600067139, + "num_tokens": 646000486.0, + "step": 24963 + }, + { + "epoch": 2.741489128047441, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4641079902648926, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6944311857223511, + "num_tokens": 646028394.0, + "step": 24964 + }, + { + "epoch": 2.741598945750055, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.272739887237549, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.698662281036377, + "num_tokens": 646058019.0, + "step": 24965 + }, + { + "epoch": 2.7417087634526687, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4552907943725586, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 
0.7044639587402344, + "num_tokens": 646082838.0, + "step": 24966 + }, + { + "epoch": 2.7418185811552824, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2197837829589844, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7219687700271606, + "num_tokens": 646112665.0, + "step": 24967 + }, + { + "epoch": 2.7419283988578957, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1398396492004395, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7013731002807617, + "num_tokens": 646143820.0, + "step": 24968 + }, + { + "epoch": 2.7420382165605095, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.412163257598877, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7021826505661011, + "num_tokens": 646170636.0, + "step": 24969 + }, + { + "epoch": 2.7421480342631233, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7334258556365967, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7036197185516357, + "num_tokens": 646192243.0, + "step": 24970 + }, + { + "epoch": 2.742257851965737, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2727317810058594, + "learning_rate": 1e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.6937182545661926, + "num_tokens": 646224106.0, + "step": 24971 + }, + { + "epoch": 2.7423676696683508, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.313133478164673, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7029057145118713, + "num_tokens": 646251003.0, + "step": 24972 + }, + { + "epoch": 2.742477487370964, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.368443727493286, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6959887742996216, + "num_tokens": 646278161.0, + "step": 24973 + }, + { + "epoch": 2.742587305073578, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.550966262817383, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7437134981155396, + 
"num_tokens": 646300023.0, + "step": 24974 + }, + { + "epoch": 2.7426971227761916, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1438138484954834, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7023587226867676, + "num_tokens": 646330269.0, + "step": 24975 + }, + { + "epoch": 2.742806940478805, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3420250415802, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7019445300102234, + "num_tokens": 646359177.0, + "step": 24976 + }, + { + "epoch": 2.742916758181419, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.814413547515869, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7189801931381226, + "num_tokens": 646378780.0, + "step": 24977 + }, + { + "epoch": 2.7430265758840324, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3501830101013184, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7229204177856445, + "num_tokens": 646408380.0, + "step": 24978 + }, + { + "epoch": 2.743136393586646, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3527474403381348, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.714225709438324, + "num_tokens": 646436526.0, + "step": 24979 + }, + { + "epoch": 2.74324621128926, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6454477310180664, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7322183847427368, + "num_tokens": 646457486.0, + "step": 24980 + }, + { + "epoch": 2.7433560289918733, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6576380729675293, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7185021042823792, + "num_tokens": 646478215.0, + "step": 24981 + }, + { + "epoch": 2.743465846694487, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4930930137634277, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7131003141403198, + "num_tokens": 646502529.0, + 
"step": 24982 + }, + { + "epoch": 2.7435756643971008, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.277316093444824, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.7058275938034058, + "num_tokens": 646532780.0, + "step": 24983 + }, + { + "epoch": 2.7436854820997145, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4913041591644287, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7289227843284607, + "num_tokens": 646555900.0, + "step": 24984 + }, + { + "epoch": 2.7437952998023283, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.9953871965408325, + "learning_rate": 1e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6880831718444824, + "num_tokens": 646590148.0, + "step": 24985 + }, + { + "epoch": 2.7439051175049416, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.414612293243408, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7307161688804626, + "num_tokens": 646615531.0, + "step": 24986 + }, + { + "epoch": 2.7440149352075554, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5056114196777344, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7284337282180786, + "num_tokens": 646640800.0, + "step": 24987 + }, + { + "epoch": 2.744124752910169, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.625659704208374, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7161129713058472, + "num_tokens": 646664533.0, + "step": 24988 + }, + { + "epoch": 2.744234570612783, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.460979700088501, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7334681749343872, + "num_tokens": 646687570.0, + "step": 24989 + }, + { + "epoch": 2.7443443883153966, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.438431978225708, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6989659667015076, + "num_tokens": 646713050.0, + "step": 24990 + }, + { + 
"epoch": 2.74445420601801, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5075459480285645, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7299444675445557, + "num_tokens": 646736710.0, + "step": 24991 + }, + { + "epoch": 2.7445640237206237, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.983806610107422, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7184265851974487, + "num_tokens": 646760872.0, + "step": 24992 + }, + { + "epoch": 2.7446738414232374, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6229097843170166, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7350101470947266, + "num_tokens": 646783717.0, + "step": 24993 + }, + { + "epoch": 2.744783659125851, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3852200508117676, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7057835459709167, + "num_tokens": 646810412.0, + "step": 24994 + }, + { + "epoch": 2.744893476828465, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6587142944335938, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7071174383163452, + "num_tokens": 646833771.0, + "step": 24995 + }, + { + "epoch": 2.7450032945310783, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4823031425476074, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7190052270889282, + "num_tokens": 646858231.0, + "step": 24996 + }, + { + "epoch": 2.745113112233692, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8237104415893555, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.713123083114624, + "num_tokens": 646879355.0, + "step": 24997 + }, + { + "epoch": 2.745222929936306, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3120603561401367, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6841249465942383, + "num_tokens": 646910891.0, + "step": 24998 + }, + { + "epoch": 2.7453327476389195, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.433781385421753, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7250279188156128, + "num_tokens": 646938572.0, + "step": 24999 + }, + { + "epoch": 2.7454425653415333, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.447002410888672, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.724961519241333, + "num_tokens": 646964803.0, + "step": 25000 + }, + { + "epoch": 2.7455523830441466, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1501314640045166, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.709160566329956, + "num_tokens": 646997019.0, + "step": 25001 + }, + { + "epoch": 2.7456622007467604, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4778389930725098, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7252591848373413, + "num_tokens": 647020212.0, + "step": 25002 + }, + { + "epoch": 2.745772018449374, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2931928634643555, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.7032923698425293, + "num_tokens": 647049845.0, + "step": 25003 + }, + { + "epoch": 2.7458818361519874, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3467979431152344, + "learning_rate": 1e-06, + "loss": 1.0824, + "mean_token_accuracy": 0.6967856287956238, + "num_tokens": 647078205.0, + "step": 25004 + }, + { + "epoch": 2.745991653854601, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0884528160095215, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7001749873161316, + "num_tokens": 647113634.0, + "step": 25005 + }, + { + "epoch": 2.746101471557215, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6010165214538574, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7106581926345825, + "num_tokens": 647136875.0, + "step": 25006 + }, + { + "epoch": 2.7462112892598287, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.3691155910491943, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7181760668754578, + "num_tokens": 647162932.0, + "step": 25007 + }, + { + "epoch": 2.7463211069624425, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4045002460479736, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6981011629104614, + "num_tokens": 647190846.0, + "step": 25008 + }, + { + "epoch": 2.746430924665056, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.324120283126831, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7114499807357788, + "num_tokens": 647219098.0, + "step": 25009 + }, + { + "epoch": 2.7465407423676695, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.419745683670044, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7113631963729858, + "num_tokens": 647243920.0, + "step": 25010 + }, + { + "epoch": 2.7466505600702833, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4516210556030273, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7143000364303589, + "num_tokens": 647268324.0, + "step": 25011 + }, + { + "epoch": 2.746760377772897, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6252269744873047, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7191574573516846, + "num_tokens": 647291575.0, + "step": 25012 + }, + { + "epoch": 2.746870195475511, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6086859703063965, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7396935820579529, + "num_tokens": 647314862.0, + "step": 25013 + }, + { + "epoch": 2.746980013178124, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5653340816497803, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7082980871200562, + "num_tokens": 647338860.0, + "step": 25014 + }, + { + "epoch": 2.747089830880738, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.337131977081299, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.712121307849884, + "num_tokens": 647367882.0, + "step": 25015 + }, + { + "epoch": 2.7471996485833516, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.366476535797119, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6938228011131287, + "num_tokens": 647394841.0, + "step": 25016 + }, + { + "epoch": 2.7473094662859654, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4051716327667236, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7182876467704773, + "num_tokens": 647419836.0, + "step": 25017 + }, + { + "epoch": 2.747419283988579, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4980225563049316, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7199363112449646, + "num_tokens": 647445445.0, + "step": 25018 + }, + { + "epoch": 2.7475291016911925, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.167208194732666, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7225016951560974, + "num_tokens": 647476707.0, + "step": 25019 + }, + { + "epoch": 2.7476389193938062, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.062699556350708, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7257770299911499, + "num_tokens": 647493810.0, + "step": 25020 + }, + { + "epoch": 2.74774873709642, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3420867919921875, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.725646436214447, + "num_tokens": 647521427.0, + "step": 25021 + }, + { + "epoch": 2.7478585547990337, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3933253288269043, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7098275423049927, + "num_tokens": 647549213.0, + "step": 25022 + }, + { + "epoch": 2.7479683725016475, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2508182525634766, 
+ "learning_rate": 1e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.675932765007019, + "num_tokens": 647581008.0, + "step": 25023 + }, + { + "epoch": 2.748078190204261, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5170488357543945, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7245092391967773, + "num_tokens": 647605790.0, + "step": 25024 + }, + { + "epoch": 2.7481880079068746, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3022971153259277, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7157015800476074, + "num_tokens": 647631816.0, + "step": 25025 + }, + { + "epoch": 2.7482978256094883, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.530184745788574, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7288676500320435, + "num_tokens": 647656111.0, + "step": 25026 + }, + { + "epoch": 2.7484076433121016, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5991687774658203, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7180542945861816, + "num_tokens": 647677258.0, + "step": 25027 + }, + { + "epoch": 2.748517461014716, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.06127667427063, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7161604166030884, + "num_tokens": 647697016.0, + "step": 25028 + }, + { + "epoch": 2.748627278717329, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.427598237991333, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7095250487327576, + "num_tokens": 647723095.0, + "step": 25029 + }, + { + "epoch": 2.748737096419943, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.583186149597168, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7009932994842529, + "num_tokens": 647747672.0, + "step": 25030 + }, + { + "epoch": 2.7488469141225567, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4824328422546387, + "learning_rate": 1e-06, + "loss": 
0.9682, + "mean_token_accuracy": 0.7134690284729004, + "num_tokens": 647774007.0, + "step": 25031 + }, + { + "epoch": 2.74895673182517, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4508590698242188, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.704795241355896, + "num_tokens": 647804272.0, + "step": 25032 + }, + { + "epoch": 2.7490665495277837, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.618553638458252, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7243542671203613, + "num_tokens": 647827428.0, + "step": 25033 + }, + { + "epoch": 2.7491763672303975, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.610812187194824, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7133766412734985, + "num_tokens": 647849150.0, + "step": 25034 + }, + { + "epoch": 2.7492861849330112, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4419045448303223, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7295408248901367, + "num_tokens": 647876113.0, + "step": 25035 + }, + { + "epoch": 2.749396002635625, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.30602765083313, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7052465677261353, + "num_tokens": 647903846.0, + "step": 25036 + }, + { + "epoch": 2.7495058203382383, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1451680660247803, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6900241374969482, + "num_tokens": 647938368.0, + "step": 25037 + }, + { + "epoch": 2.749615638040852, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.207709789276123, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7051641941070557, + "num_tokens": 647970228.0, + "step": 25038 + }, + { + "epoch": 2.749725455743466, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3168134689331055, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 
0.7032535672187805, + "num_tokens": 647999975.0, + "step": 25039 + }, + { + "epoch": 2.7498352734460796, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4266669750213623, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7162747383117676, + "num_tokens": 648025289.0, + "step": 25040 + }, + { + "epoch": 2.7499450911486933, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.253474235534668, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7173788547515869, + "num_tokens": 648056141.0, + "step": 25041 + }, + { + "epoch": 2.7500549088513067, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.193995714187622, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7165710926055908, + "num_tokens": 648087984.0, + "step": 25042 + }, + { + "epoch": 2.7501647265539204, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.251455307006836, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7286646366119385, + "num_tokens": 648118362.0, + "step": 25043 + }, + { + "epoch": 2.750274544256534, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3883159160614014, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7392094135284424, + "num_tokens": 648142816.0, + "step": 25044 + }, + { + "epoch": 2.750384361959148, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2512967586517334, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7358639240264893, + "num_tokens": 648170513.0, + "step": 25045 + }, + { + "epoch": 2.7504941796617617, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.397596597671509, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7220718860626221, + "num_tokens": 648195166.0, + "step": 25046 + }, + { + "epoch": 2.750603997364375, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.299379825592041, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7108928561210632, + 
"num_tokens": 648222767.0, + "step": 25047 + }, + { + "epoch": 2.7507138150669888, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0470476150512695, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7125252485275269, + "num_tokens": 648255722.0, + "step": 25048 + }, + { + "epoch": 2.7508236327696025, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.424221992492676, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7313203811645508, + "num_tokens": 648280043.0, + "step": 25049 + }, + { + "epoch": 2.7509334504722163, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3284475803375244, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6897245645523071, + "num_tokens": 648307814.0, + "step": 25050 + }, + { + "epoch": 2.75104326817483, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3593082427978516, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7007113695144653, + "num_tokens": 648336217.0, + "step": 25051 + }, + { + "epoch": 2.7511530858774433, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3114335536956787, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7312224507331848, + "num_tokens": 648362643.0, + "step": 25052 + }, + { + "epoch": 2.751262903580057, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3631269931793213, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.7017085552215576, + "num_tokens": 648389974.0, + "step": 25053 + }, + { + "epoch": 2.751372721282671, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2801620960235596, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6949760913848877, + "num_tokens": 648418638.0, + "step": 25054 + }, + { + "epoch": 2.751482538985284, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6212306022644043, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7174140214920044, + "num_tokens": 648440188.0, + 
"step": 25055 + }, + { + "epoch": 2.7515923566878984, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.539675712585449, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7363370656967163, + "num_tokens": 648463041.0, + "step": 25056 + }, + { + "epoch": 2.7517021743905117, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4585752487182617, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7245724201202393, + "num_tokens": 648489706.0, + "step": 25057 + }, + { + "epoch": 2.7518119920931254, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.604292869567871, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7086541056632996, + "num_tokens": 648513222.0, + "step": 25058 + }, + { + "epoch": 2.751921809795739, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.49695086479187, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7264590859413147, + "num_tokens": 648538625.0, + "step": 25059 + }, + { + "epoch": 2.7520316274983525, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.51137113571167, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7161725759506226, + "num_tokens": 648562289.0, + "step": 25060 + }, + { + "epoch": 2.7521414452009663, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.170717716217041, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6995524168014526, + "num_tokens": 648595217.0, + "step": 25061 + }, + { + "epoch": 2.75225126290358, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.424100637435913, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7148430943489075, + "num_tokens": 648620006.0, + "step": 25062 + }, + { + "epoch": 2.7523610806061938, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.311811923980713, + "learning_rate": 1e-06, + "loss": 1.088, + "mean_token_accuracy": 0.6759944558143616, + "num_tokens": 648648475.0, + "step": 25063 + }, + { + "epoch": 
2.7524708983088075, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.508856773376465, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7171201705932617, + "num_tokens": 648672852.0, + "step": 25064 + }, + { + "epoch": 2.752580716011421, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4009957313537598, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7387927174568176, + "num_tokens": 648698109.0, + "step": 25065 + }, + { + "epoch": 2.7526905337140346, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.376725673675537, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7081441879272461, + "num_tokens": 648725701.0, + "step": 25066 + }, + { + "epoch": 2.7528003514166484, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4615306854248047, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7411020994186401, + "num_tokens": 648749283.0, + "step": 25067 + }, + { + "epoch": 2.752910169119262, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5429036617279053, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7407751083374023, + "num_tokens": 648771991.0, + "step": 25068 + }, + { + "epoch": 2.753019986821876, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.486957311630249, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7278269529342651, + "num_tokens": 648795677.0, + "step": 25069 + }, + { + "epoch": 2.753129804524489, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5403530597686768, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7169444561004639, + "num_tokens": 648818931.0, + "step": 25070 + }, + { + "epoch": 2.753239622227103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4403879642486572, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6993060111999512, + "num_tokens": 648847331.0, + "step": 25071 + }, + { + "epoch": 2.7533494399297167, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7721328735351562, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7053455114364624, + "num_tokens": 648868817.0, + "step": 25072 + }, + { + "epoch": 2.7534592576323305, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4123692512512207, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.7047716975212097, + "num_tokens": 648895614.0, + "step": 25073 + }, + { + "epoch": 2.753569075334944, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4405925273895264, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7184404134750366, + "num_tokens": 648921781.0, + "step": 25074 + }, + { + "epoch": 2.7536788930375575, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2534737586975098, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6950439214706421, + "num_tokens": 648952615.0, + "step": 25075 + }, + { + "epoch": 2.7537887107401713, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.458660125732422, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7238661050796509, + "num_tokens": 648976681.0, + "step": 25076 + }, + { + "epoch": 2.753898528442785, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5249810218811035, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7229132652282715, + "num_tokens": 649001451.0, + "step": 25077 + }, + { + "epoch": 2.754008346145399, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.333871603012085, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6920276880264282, + "num_tokens": 649030563.0, + "step": 25078 + }, + { + "epoch": 2.7541181638480126, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6421618461608887, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7092359066009521, + "num_tokens": 649052981.0, + "step": 25079 + }, + { + "epoch": 2.754227981550626, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.4256253242492676, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7164463400840759, + "num_tokens": 649081091.0, + "step": 25080 + }, + { + "epoch": 2.7543377992532396, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4699203968048096, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7160027027130127, + "num_tokens": 649105045.0, + "step": 25081 + }, + { + "epoch": 2.7544476169558534, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.315504789352417, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7244319915771484, + "num_tokens": 649133027.0, + "step": 25082 + }, + { + "epoch": 2.7545574346584667, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.743502140045166, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7300771474838257, + "num_tokens": 649151761.0, + "step": 25083 + }, + { + "epoch": 2.7546672523610805, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.202899217605591, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7107194066047668, + "num_tokens": 649183703.0, + "step": 25084 + }, + { + "epoch": 2.754777070063694, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4300801753997803, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7064908742904663, + "num_tokens": 649209334.0, + "step": 25085 + }, + { + "epoch": 2.754886887766308, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3142013549804688, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7331002354621887, + "num_tokens": 649235028.0, + "step": 25086 + }, + { + "epoch": 2.7549967054689217, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.541823148727417, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7155390381813049, + "num_tokens": 649258176.0, + "step": 25087 + }, + { + "epoch": 2.755106523171535, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.5528292655944824, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7222039699554443, + "num_tokens": 649281539.0, + "step": 25088 + }, + { + "epoch": 2.755216340874149, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.231191873550415, + "learning_rate": 1e-06, + "loss": 1.0807, + "mean_token_accuracy": 0.6817178726196289, + "num_tokens": 649312403.0, + "step": 25089 + }, + { + "epoch": 2.7553261585767626, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3549845218658447, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6935175657272339, + "num_tokens": 649341352.0, + "step": 25090 + }, + { + "epoch": 2.7554359762793763, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.661405324935913, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7199150323867798, + "num_tokens": 649363178.0, + "step": 25091 + }, + { + "epoch": 2.75554579398199, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.870544195175171, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7333453893661499, + "num_tokens": 649381050.0, + "step": 25092 + }, + { + "epoch": 2.7556556116846034, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.521165132522583, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7217957973480225, + "num_tokens": 649405317.0, + "step": 25093 + }, + { + "epoch": 2.755765429387217, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5024445056915283, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7222149968147278, + "num_tokens": 649430816.0, + "step": 25094 + }, + { + "epoch": 2.755875247089831, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5221214294433594, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7364098429679871, + "num_tokens": 649456318.0, + "step": 25095 + }, + { + "epoch": 2.7559850647924446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.590914726257324, + 
"learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7145708799362183, + "num_tokens": 649480182.0, + "step": 25096 + }, + { + "epoch": 2.7560948824950584, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6067256927490234, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7191011905670166, + "num_tokens": 649504133.0, + "step": 25097 + }, + { + "epoch": 2.7562047001976717, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.322486639022827, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7085388898849487, + "num_tokens": 649533342.0, + "step": 25098 + }, + { + "epoch": 2.7563145179002855, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1057121753692627, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6940004229545593, + "num_tokens": 649567140.0, + "step": 25099 + }, + { + "epoch": 2.7564243356028992, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3838560581207275, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7185827493667603, + "num_tokens": 649591643.0, + "step": 25100 + }, + { + "epoch": 2.756534153305513, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.177125930786133, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6956280469894409, + "num_tokens": 649624483.0, + "step": 25101 + }, + { + "epoch": 2.7566439710081267, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.360752582550049, + "learning_rate": 1e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7555055022239685, + "num_tokens": 649650361.0, + "step": 25102 + }, + { + "epoch": 2.75675378871074, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.761523723602295, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7025465965270996, + "num_tokens": 649670929.0, + "step": 25103 + }, + { + "epoch": 2.756863606413354, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1137442588806152, + "learning_rate": 1e-06, + 
"loss": 0.9916, + "mean_token_accuracy": 0.7157575488090515, + "num_tokens": 649704762.0, + "step": 25104 + }, + { + "epoch": 2.7569734241159676, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5559842586517334, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6945379972457886, + "num_tokens": 649728888.0, + "step": 25105 + }, + { + "epoch": 2.757083241818581, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3469700813293457, + "learning_rate": 1e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.6850677728652954, + "num_tokens": 649756537.0, + "step": 25106 + }, + { + "epoch": 2.757193059521195, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.40456223487854, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7134573459625244, + "num_tokens": 649784537.0, + "step": 25107 + }, + { + "epoch": 2.7573028772238084, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3835554122924805, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6903774738311768, + "num_tokens": 649812101.0, + "step": 25108 + }, + { + "epoch": 2.757412694926422, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2128758430480957, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7260444164276123, + "num_tokens": 649840318.0, + "step": 25109 + }, + { + "epoch": 2.757522512629036, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4931132793426514, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7256104946136475, + "num_tokens": 649865001.0, + "step": 25110 + }, + { + "epoch": 2.7576323303316492, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.490251302719116, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7313253879547119, + "num_tokens": 649889271.0, + "step": 25111 + }, + { + "epoch": 2.757742148034263, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3250625133514404, + "learning_rate": 1e-06, + "loss": 1.015, + 
"mean_token_accuracy": 0.7056385278701782, + "num_tokens": 649916816.0, + "step": 25112 + }, + { + "epoch": 2.7578519657368767, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.545639753341675, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6974371671676636, + "num_tokens": 649941801.0, + "step": 25113 + }, + { + "epoch": 2.7579617834394905, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6146812438964844, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7152031660079956, + "num_tokens": 649964154.0, + "step": 25114 + }, + { + "epoch": 2.7580716011421043, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.252638816833496, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7053666710853577, + "num_tokens": 649994431.0, + "step": 25115 + }, + { + "epoch": 2.7581814188447176, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3700356483459473, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7176039814949036, + "num_tokens": 650023704.0, + "step": 25116 + }, + { + "epoch": 2.7582912365473313, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5996954441070557, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7116007208824158, + "num_tokens": 650046884.0, + "step": 25117 + }, + { + "epoch": 2.758401054249945, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3732359409332275, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7331351041793823, + "num_tokens": 650071453.0, + "step": 25118 + }, + { + "epoch": 2.758510871952559, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3340065479278564, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7292054891586304, + "num_tokens": 650096453.0, + "step": 25119 + }, + { + "epoch": 2.7586206896551726, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5926766395568848, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 
0.7118827104568481, + "num_tokens": 650120391.0, + "step": 25120 + }, + { + "epoch": 2.758730507357786, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5963611602783203, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7051903009414673, + "num_tokens": 650144385.0, + "step": 25121 + }, + { + "epoch": 2.7588403250603997, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5488486289978027, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6979321837425232, + "num_tokens": 650169422.0, + "step": 25122 + }, + { + "epoch": 2.7589501427630134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4766433238983154, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7112433910369873, + "num_tokens": 650194483.0, + "step": 25123 + }, + { + "epoch": 2.759059960465627, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.394298553466797, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7056331634521484, + "num_tokens": 650220021.0, + "step": 25124 + }, + { + "epoch": 2.759169778168241, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.469771146774292, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7119187116622925, + "num_tokens": 650246245.0, + "step": 25125 + }, + { + "epoch": 2.7592795958708543, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4251275062561035, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7016848921775818, + "num_tokens": 650272698.0, + "step": 25126 + }, + { + "epoch": 2.759389413573468, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4523866176605225, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.718337893486023, + "num_tokens": 650297928.0, + "step": 25127 + }, + { + "epoch": 2.7594992312760818, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5390961170196533, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.7034509778022766, + 
"num_tokens": 650323221.0, + "step": 25128 + }, + { + "epoch": 2.7596090489786955, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1455442905426025, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7234686613082886, + "num_tokens": 650354418.0, + "step": 25129 + }, + { + "epoch": 2.7597188666813093, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5339016914367676, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7395102977752686, + "num_tokens": 650376365.0, + "step": 25130 + }, + { + "epoch": 2.7598286843839226, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5242044925689697, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.700201153755188, + "num_tokens": 650400045.0, + "step": 25131 + }, + { + "epoch": 2.7599385020865363, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9164626598358154, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7211995124816895, + "num_tokens": 650418615.0, + "step": 25132 + }, + { + "epoch": 2.76004831978915, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3566102981567383, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7344932556152344, + "num_tokens": 650443488.0, + "step": 25133 + }, + { + "epoch": 2.7601581374917634, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.290431261062622, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.706598162651062, + "num_tokens": 650473019.0, + "step": 25134 + }, + { + "epoch": 2.760267955194377, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3328990936279297, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7232842445373535, + "num_tokens": 650499245.0, + "step": 25135 + }, + { + "epoch": 2.760377772896991, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5675535202026367, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7158915400505066, + "num_tokens": 650522784.0, + 
"step": 25136 + }, + { + "epoch": 2.7604875905996047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6242547035217285, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7192649245262146, + "num_tokens": 650543810.0, + "step": 25137 + }, + { + "epoch": 2.7605974083022184, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5093319416046143, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.727400541305542, + "num_tokens": 650565417.0, + "step": 25138 + }, + { + "epoch": 2.7607072260048318, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2345170974731445, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7173104286193848, + "num_tokens": 650596977.0, + "step": 25139 + }, + { + "epoch": 2.7608170437074455, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.455660343170166, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7047937512397766, + "num_tokens": 650621860.0, + "step": 25140 + }, + { + "epoch": 2.7609268614100593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4307990074157715, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7123934626579285, + "num_tokens": 650646498.0, + "step": 25141 + }, + { + "epoch": 2.761036679112673, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 32.38726043701172, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7349647283554077, + "num_tokens": 650664077.0, + "step": 25142 + }, + { + "epoch": 2.761146496815287, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.734269380569458, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7094969749450684, + "num_tokens": 650686151.0, + "step": 25143 + }, + { + "epoch": 2.7612563145179, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.495241165161133, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6862589120864868, + "num_tokens": 650714203.0, + "step": 25144 + }, + { + "epoch": 
2.761366132220514, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.816318988800049, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7352340221405029, + "num_tokens": 650734054.0, + "step": 25145 + }, + { + "epoch": 2.7614759499231276, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.555469274520874, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.693111002445221, + "num_tokens": 650761726.0, + "step": 25146 + }, + { + "epoch": 2.7615857676257414, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5879950523376465, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7175390720367432, + "num_tokens": 650784206.0, + "step": 25147 + }, + { + "epoch": 2.761695585328355, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.54897403717041, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7063167691230774, + "num_tokens": 650807736.0, + "step": 25148 + }, + { + "epoch": 2.7618054030309684, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3933486938476562, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7234962582588196, + "num_tokens": 650830542.0, + "step": 25149 + }, + { + "epoch": 2.761915220733582, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3100411891937256, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7221250534057617, + "num_tokens": 650859543.0, + "step": 25150 + }, + { + "epoch": 2.762025038436196, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.572298526763916, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7273039817810059, + "num_tokens": 650881841.0, + "step": 25151 + }, + { + "epoch": 2.7621348561388097, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5976064205169678, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7299032211303711, + "num_tokens": 650906005.0, + "step": 25152 + }, + { + "epoch": 2.7622446738414235, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.2781550884246826, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7196421027183533, + "num_tokens": 650933378.0, + "step": 25153 + }, + { + "epoch": 2.762354491544037, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4481823444366455, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.696877121925354, + "num_tokens": 650960849.0, + "step": 25154 + }, + { + "epoch": 2.7624643092466505, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.700014114379883, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7122014760971069, + "num_tokens": 650990245.0, + "step": 25155 + }, + { + "epoch": 2.7625741269492643, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8730082511901855, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.726128101348877, + "num_tokens": 651007513.0, + "step": 25156 + }, + { + "epoch": 2.7626839446518776, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5033063888549805, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7176799774169922, + "num_tokens": 651033184.0, + "step": 25157 + }, + { + "epoch": 2.762793762354492, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3576531410217285, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7138461470603943, + "num_tokens": 651061072.0, + "step": 25158 + }, + { + "epoch": 2.762903580057105, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.49072527885437, + "learning_rate": 1e-06, + "loss": 1.0817, + "mean_token_accuracy": 0.6842691898345947, + "num_tokens": 651086402.0, + "step": 25159 + }, + { + "epoch": 2.763013397759719, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4705352783203125, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7105206847190857, + "num_tokens": 651110355.0, + "step": 25160 + }, + { + "epoch": 2.7631232154623326, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.437915325164795, + "learning_rate": 1e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7699579000473022, + "num_tokens": 651132937.0, + "step": 25161 + }, + { + "epoch": 2.763233033164946, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.425574779510498, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7137524485588074, + "num_tokens": 651159254.0, + "step": 25162 + }, + { + "epoch": 2.7633428508675597, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4579391479492188, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7027083039283752, + "num_tokens": 651183771.0, + "step": 25163 + }, + { + "epoch": 2.7634526685701735, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4000749588012695, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7159135341644287, + "num_tokens": 651210647.0, + "step": 25164 + }, + { + "epoch": 2.763562486272787, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.604093551635742, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7108781337738037, + "num_tokens": 651232450.0, + "step": 25165 + }, + { + "epoch": 2.763672303975401, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.236401081085205, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6944369077682495, + "num_tokens": 651262363.0, + "step": 25166 + }, + { + "epoch": 2.7637821216780143, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.253913640975952, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.7062402367591858, + "num_tokens": 651294423.0, + "step": 25167 + }, + { + "epoch": 2.763891939380628, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4373207092285156, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.7048102617263794, + "num_tokens": 651320244.0, + "step": 25168 + }, + { + "epoch": 2.764001757083242, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4893174171447754, + 
"learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7135486006736755, + "num_tokens": 651345228.0, + "step": 25169 + }, + { + "epoch": 2.7641115747858556, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.377197742462158, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7239282131195068, + "num_tokens": 651370252.0, + "step": 25170 + }, + { + "epoch": 2.7642213924884693, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1788856983184814, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7229433655738831, + "num_tokens": 651399536.0, + "step": 25171 + }, + { + "epoch": 2.7643312101910826, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5131795406341553, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6935677528381348, + "num_tokens": 651427551.0, + "step": 25172 + }, + { + "epoch": 2.7644410278936964, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.300168752670288, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.7052081823348999, + "num_tokens": 651456709.0, + "step": 25173 + }, + { + "epoch": 2.76455084559631, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5040457248687744, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.711536705493927, + "num_tokens": 651483211.0, + "step": 25174 + }, + { + "epoch": 2.764660663298924, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8000144958496094, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7290397882461548, + "num_tokens": 651503020.0, + "step": 25175 + }, + { + "epoch": 2.7647704810015377, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.486315965652466, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7105374336242676, + "num_tokens": 651527231.0, + "step": 25176 + }, + { + "epoch": 2.764880298704151, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.374084711074829, + "learning_rate": 1e-06, + 
"loss": 0.9119, + "mean_token_accuracy": 0.7261703014373779, + "num_tokens": 651554015.0, + "step": 25177 + }, + { + "epoch": 2.7649901164067647, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4726500511169434, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7157946825027466, + "num_tokens": 651580250.0, + "step": 25178 + }, + { + "epoch": 2.7650999341093785, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.281667709350586, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7205761671066284, + "num_tokens": 651606831.0, + "step": 25179 + }, + { + "epoch": 2.7652097518119922, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5331833362579346, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7166008353233337, + "num_tokens": 651629974.0, + "step": 25180 + }, + { + "epoch": 2.765319569514606, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2805492877960205, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7169556617736816, + "num_tokens": 651657057.0, + "step": 25181 + }, + { + "epoch": 2.7654293872172193, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.380340099334717, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7289742827415466, + "num_tokens": 651682385.0, + "step": 25182 + }, + { + "epoch": 2.765539204919833, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.498889684677124, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7078087329864502, + "num_tokens": 651706546.0, + "step": 25183 + }, + { + "epoch": 2.765649022622447, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.189749240875244, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7325623631477356, + "num_tokens": 651734169.0, + "step": 25184 + }, + { + "epoch": 2.76575884032506, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4416322708129883, + "learning_rate": 1e-06, + "loss": 0.9377, + 
"mean_token_accuracy": 0.7208523750305176, + "num_tokens": 651760196.0, + "step": 25185 + }, + { + "epoch": 2.765868658027674, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2762579917907715, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.7016317844390869, + "num_tokens": 651791156.0, + "step": 25186 + }, + { + "epoch": 2.7659784757302877, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8700039386749268, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7358349561691284, + "num_tokens": 651809139.0, + "step": 25187 + }, + { + "epoch": 2.7660882934329014, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6922574043273926, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.730017900466919, + "num_tokens": 651829919.0, + "step": 25188 + }, + { + "epoch": 2.766198111135515, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.326117753982544, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6969476938247681, + "num_tokens": 651859312.0, + "step": 25189 + }, + { + "epoch": 2.7663079288381285, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2362220287323, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7126936912536621, + "num_tokens": 651889055.0, + "step": 25190 + }, + { + "epoch": 2.7664177465407422, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 4.147412300109863, + "learning_rate": 1e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7530789971351624, + "num_tokens": 651906880.0, + "step": 25191 + }, + { + "epoch": 2.766527564243356, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.559847831726074, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7254542112350464, + "num_tokens": 651929096.0, + "step": 25192 + }, + { + "epoch": 2.7666373819459698, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.810455322265625, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 
0.6991587281227112, + "num_tokens": 651951758.0, + "step": 25193 + }, + { + "epoch": 2.7667471996485835, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4751501083374023, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.710158109664917, + "num_tokens": 651976696.0, + "step": 25194 + }, + { + "epoch": 2.766857017351197, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3578083515167236, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7272061109542847, + "num_tokens": 652004851.0, + "step": 25195 + }, + { + "epoch": 2.7669668350538106, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.651543617248535, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7137550115585327, + "num_tokens": 652027235.0, + "step": 25196 + }, + { + "epoch": 2.7670766527564243, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.379795789718628, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7093239426612854, + "num_tokens": 652053645.0, + "step": 25197 + }, + { + "epoch": 2.767186470459038, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.384289503097534, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.7025569677352905, + "num_tokens": 652080875.0, + "step": 25198 + }, + { + "epoch": 2.767296288161652, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4846622943878174, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.7040726542472839, + "num_tokens": 652105946.0, + "step": 25199 + }, + { + "epoch": 2.767406105864265, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.431521415710449, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.715401291847229, + "num_tokens": 652132432.0, + "step": 25200 + }, + { + "epoch": 2.767515923566879, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0638387203216553, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6856915950775146, + "num_tokens": 
652168571.0, + "step": 25201 + }, + { + "epoch": 2.7676257412694927, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4990241527557373, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7204834222793579, + "num_tokens": 652192561.0, + "step": 25202 + }, + { + "epoch": 2.7677355589721064, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5732884407043457, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7135812640190125, + "num_tokens": 652216199.0, + "step": 25203 + }, + { + "epoch": 2.76784537667472, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.528512954711914, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.7005292773246765, + "num_tokens": 652240440.0, + "step": 25204 + }, + { + "epoch": 2.7679551943773335, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5157899856567383, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.7091841101646423, + "num_tokens": 652264582.0, + "step": 25205 + }, + { + "epoch": 2.7680650120799473, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.344480276107788, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7334433794021606, + "num_tokens": 652291585.0, + "step": 25206 + }, + { + "epoch": 2.768174829782561, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2574374675750732, + "learning_rate": 1e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.6950873136520386, + "num_tokens": 652323196.0, + "step": 25207 + }, + { + "epoch": 2.7682846474851748, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.400498867034912, + "learning_rate": 1e-06, + "loss": 1.061, + "mean_token_accuracy": 0.6864838004112244, + "num_tokens": 652355899.0, + "step": 25208 + }, + { + "epoch": 2.7683944651877885, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.510605812072754, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6887918710708618, + "num_tokens": 652381810.0, + "step": 25209 + 
}, + { + "epoch": 2.768504282890402, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6308178901672363, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7336678504943848, + "num_tokens": 652403246.0, + "step": 25210 + }, + { + "epoch": 2.7686141005930156, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5270144939422607, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6995262503623962, + "num_tokens": 652427886.0, + "step": 25211 + }, + { + "epoch": 2.7687239182956294, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.315702438354492, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7187900543212891, + "num_tokens": 652454926.0, + "step": 25212 + }, + { + "epoch": 2.7688337359982427, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4019625186920166, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7296887636184692, + "num_tokens": 652481219.0, + "step": 25213 + }, + { + "epoch": 2.7689435537008564, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.254892110824585, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.704294741153717, + "num_tokens": 652507642.0, + "step": 25214 + }, + { + "epoch": 2.76905337140347, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5053491592407227, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7411551475524902, + "num_tokens": 652530668.0, + "step": 25215 + }, + { + "epoch": 2.769163189106084, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5789082050323486, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7207082509994507, + "num_tokens": 652552515.0, + "step": 25216 + }, + { + "epoch": 2.7692730068086977, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.583357334136963, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7333395481109619, + "num_tokens": 652572944.0, + "step": 25217 + }, + { + "epoch": 
2.769382824511311, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.249119758605957, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7068585157394409, + "num_tokens": 652601639.0, + "step": 25218 + }, + { + "epoch": 2.7694926422139248, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.425515651702881, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.696485161781311, + "num_tokens": 652626592.0, + "step": 25219 + }, + { + "epoch": 2.7696024599165385, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.338867664337158, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.686779797077179, + "num_tokens": 652655322.0, + "step": 25220 + }, + { + "epoch": 2.7697122776191523, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5737388134002686, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7310985326766968, + "num_tokens": 652679234.0, + "step": 25221 + }, + { + "epoch": 2.769822095321766, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7691705226898193, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7171027660369873, + "num_tokens": 652699212.0, + "step": 25222 + }, + { + "epoch": 2.7699319130243794, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3597185611724854, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7210468053817749, + "num_tokens": 652726972.0, + "step": 25223 + }, + { + "epoch": 2.770041730726993, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6112964153289795, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7332792282104492, + "num_tokens": 652748924.0, + "step": 25224 + }, + { + "epoch": 2.770151548429607, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6171457767486572, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.7072247266769409, + "num_tokens": 652772025.0, + "step": 25225 + }, + { + "epoch": 2.7702613661322206, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.3846161365509033, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7319512367248535, + "num_tokens": 652798692.0, + "step": 25226 + }, + { + "epoch": 2.7703711838348344, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2183659076690674, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7097894549369812, + "num_tokens": 652830187.0, + "step": 25227 + }, + { + "epoch": 2.7704810015374477, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.448138475418091, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7226717472076416, + "num_tokens": 652853772.0, + "step": 25228 + }, + { + "epoch": 2.7705908192400615, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4270377159118652, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7137154340744019, + "num_tokens": 652879406.0, + "step": 25229 + }, + { + "epoch": 2.770700636942675, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.433053493499756, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7338612079620361, + "num_tokens": 652904322.0, + "step": 25230 + }, + { + "epoch": 2.770810454645289, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.324965000152588, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7190911769866943, + "num_tokens": 652932569.0, + "step": 25231 + }, + { + "epoch": 2.7709202723479027, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4489872455596924, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7127050161361694, + "num_tokens": 652957935.0, + "step": 25232 + }, + { + "epoch": 2.771030090050516, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4300644397735596, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7067607045173645, + "num_tokens": 652983410.0, + "step": 25233 + }, + { + "epoch": 2.77113990775313, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.4027695655822754, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7037622928619385, + "num_tokens": 653009975.0, + "step": 25234 + }, + { + "epoch": 2.7712497254557436, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.327739715576172, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7284892797470093, + "num_tokens": 653036431.0, + "step": 25235 + }, + { + "epoch": 2.771359543158357, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5761759281158447, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7288105487823486, + "num_tokens": 653058546.0, + "step": 25236 + }, + { + "epoch": 2.771469360860971, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.383172035217285, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7425279021263123, + "num_tokens": 653083335.0, + "step": 25237 + }, + { + "epoch": 2.7715791785635844, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.313457489013672, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7279707789421082, + "num_tokens": 653110027.0, + "step": 25238 + }, + { + "epoch": 2.771688996266198, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3815932273864746, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7207996845245361, + "num_tokens": 653136265.0, + "step": 25239 + }, + { + "epoch": 2.771798813968812, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6213958263397217, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7086490988731384, + "num_tokens": 653159842.0, + "step": 25240 + }, + { + "epoch": 2.771908631671425, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4513700008392334, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7106789350509644, + "num_tokens": 653186815.0, + "step": 25241 + }, + { + "epoch": 2.772018449374039, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5088367462158203, 
+ "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.7008872628211975, + "num_tokens": 653211266.0, + "step": 25242 + }, + { + "epoch": 2.7721282670766527, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3499457836151123, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.7039873600006104, + "num_tokens": 653240510.0, + "step": 25243 + }, + { + "epoch": 2.7722380847792665, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.663170576095581, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6993942260742188, + "num_tokens": 653261176.0, + "step": 25244 + }, + { + "epoch": 2.7723479024818802, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2978262901306152, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6942026019096375, + "num_tokens": 653290730.0, + "step": 25245 + }, + { + "epoch": 2.7724577201844935, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.465517997741699, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7027263641357422, + "num_tokens": 653316082.0, + "step": 25246 + }, + { + "epoch": 2.7725675378871073, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4473161697387695, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7259133458137512, + "num_tokens": 653341875.0, + "step": 25247 + }, + { + "epoch": 2.772677355589721, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.578235626220703, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7343894243240356, + "num_tokens": 653366545.0, + "step": 25248 + }, + { + "epoch": 2.772787173292335, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.629143238067627, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.7083274126052856, + "num_tokens": 653391297.0, + "step": 25249 + }, + { + "epoch": 2.7728969909949486, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5021450519561768, + "learning_rate": 1e-06, + 
"loss": 1.0018, + "mean_token_accuracy": 0.717328667640686, + "num_tokens": 653414622.0, + "step": 25250 + }, + { + "epoch": 2.773006808697562, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5479509830474854, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7357251644134521, + "num_tokens": 653438389.0, + "step": 25251 + }, + { + "epoch": 2.7731166264001756, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.495634078979492, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7088724374771118, + "num_tokens": 653462007.0, + "step": 25252 + }, + { + "epoch": 2.7732264441027894, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.225801944732666, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7053979635238647, + "num_tokens": 653491402.0, + "step": 25253 + }, + { + "epoch": 2.773336261805403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3386921882629395, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6938204765319824, + "num_tokens": 653521194.0, + "step": 25254 + }, + { + "epoch": 2.773446079508017, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.496708631515503, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7077301144599915, + "num_tokens": 653546587.0, + "step": 25255 + }, + { + "epoch": 2.7735558972106302, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4296112060546875, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7101536989212036, + "num_tokens": 653572844.0, + "step": 25256 + }, + { + "epoch": 2.773665714913244, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.207268714904785, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7019342184066772, + "num_tokens": 653604279.0, + "step": 25257 + }, + { + "epoch": 2.7737755326158577, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2611453533172607, + "learning_rate": 1e-06, + "loss": 0.9966, + 
"mean_token_accuracy": 0.7034351825714111, + "num_tokens": 653630499.0, + "step": 25258 + }, + { + "epoch": 2.7738853503184715, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.575082302093506, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7136958837509155, + "num_tokens": 653652860.0, + "step": 25259 + }, + { + "epoch": 2.7739951680210853, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5656204223632812, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7055556774139404, + "num_tokens": 653678067.0, + "step": 25260 + }, + { + "epoch": 2.7741049857236986, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3111979961395264, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7100368142127991, + "num_tokens": 653705602.0, + "step": 25261 + }, + { + "epoch": 2.7742148034263123, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.601118326187134, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7371690273284912, + "num_tokens": 653728381.0, + "step": 25262 + }, + { + "epoch": 2.774324621128926, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5566632747650146, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7098586559295654, + "num_tokens": 653752289.0, + "step": 25263 + }, + { + "epoch": 2.7744344388315394, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6341869831085205, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7263755798339844, + "num_tokens": 653776572.0, + "step": 25264 + }, + { + "epoch": 2.774544256534153, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4894614219665527, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7213817834854126, + "num_tokens": 653801743.0, + "step": 25265 + }, + { + "epoch": 2.774654074236767, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.451551675796509, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 
0.6976037621498108, + "num_tokens": 653828938.0, + "step": 25266 + }, + { + "epoch": 2.7747638919393807, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.400656223297119, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7138313055038452, + "num_tokens": 653855618.0, + "step": 25267 + }, + { + "epoch": 2.7748737096419944, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.560657024383545, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7274739146232605, + "num_tokens": 653878047.0, + "step": 25268 + }, + { + "epoch": 2.7749835273446077, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.193887710571289, + "learning_rate": 1e-06, + "loss": 1.0694, + "mean_token_accuracy": 0.6904482245445251, + "num_tokens": 653910661.0, + "step": 25269 + }, + { + "epoch": 2.7750933450472215, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.682307243347168, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7279634475708008, + "num_tokens": 653931833.0, + "step": 25270 + }, + { + "epoch": 2.7752031627498353, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4759323596954346, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7063265442848206, + "num_tokens": 653956526.0, + "step": 25271 + }, + { + "epoch": 2.775312980452449, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4177656173706055, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7058464288711548, + "num_tokens": 653981615.0, + "step": 25272 + }, + { + "epoch": 2.7754227981550628, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.298020601272583, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7208988070487976, + "num_tokens": 654009960.0, + "step": 25273 + }, + { + "epoch": 2.775532615857676, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6146323680877686, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7125661373138428, + 
"num_tokens": 654032444.0, + "step": 25274 + }, + { + "epoch": 2.77564243356029, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.953714609146118, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7140018343925476, + "num_tokens": 654051166.0, + "step": 25275 + }, + { + "epoch": 2.7757522512629036, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.511275053024292, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6990609169006348, + "num_tokens": 654076902.0, + "step": 25276 + }, + { + "epoch": 2.7758620689655173, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.667011260986328, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7420735359191895, + "num_tokens": 654096938.0, + "step": 25277 + }, + { + "epoch": 2.775971886668131, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.358814001083374, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7216689586639404, + "num_tokens": 654122767.0, + "step": 25278 + }, + { + "epoch": 2.7760817043707444, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6299989223480225, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7143791913986206, + "num_tokens": 654145677.0, + "step": 25279 + }, + { + "epoch": 2.776191522073358, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4646151065826416, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.719037652015686, + "num_tokens": 654169535.0, + "step": 25280 + }, + { + "epoch": 2.776301339775972, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6370973587036133, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7464026808738708, + "num_tokens": 654192822.0, + "step": 25281 + }, + { + "epoch": 2.7764111574785857, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0442843437194824, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.706789493560791, + "num_tokens": 654225419.0, + 
"step": 25282 + }, + { + "epoch": 2.7765209751811994, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3987162113189697, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7173402309417725, + "num_tokens": 654249585.0, + "step": 25283 + }, + { + "epoch": 2.7766307928838128, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5305280685424805, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7127882242202759, + "num_tokens": 654273537.0, + "step": 25284 + }, + { + "epoch": 2.7767406105864265, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3008370399475098, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7045351266860962, + "num_tokens": 654303510.0, + "step": 25285 + }, + { + "epoch": 2.7768504282890403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1758534908294678, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6985787153244019, + "num_tokens": 654333327.0, + "step": 25286 + }, + { + "epoch": 2.7769602459916536, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.717193841934204, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7152758836746216, + "num_tokens": 654353996.0, + "step": 25287 + }, + { + "epoch": 2.777070063694268, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7459347248077393, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.728024959564209, + "num_tokens": 654374873.0, + "step": 25288 + }, + { + "epoch": 2.777179881396881, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5897257328033447, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7307171821594238, + "num_tokens": 654397303.0, + "step": 25289 + }, + { + "epoch": 2.777289699099495, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6404175758361816, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7013062238693237, + "num_tokens": 654421715.0, + "step": 25290 + }, + { + 
"epoch": 2.7773995168021086, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.443019151687622, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7084845304489136, + "num_tokens": 654446136.0, + "step": 25291 + }, + { + "epoch": 2.777509334504722, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.596325397491455, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7417676448822021, + "num_tokens": 654469152.0, + "step": 25292 + }, + { + "epoch": 2.7776191522073357, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2667958736419678, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7504950761795044, + "num_tokens": 654495377.0, + "step": 25293 + }, + { + "epoch": 2.7777289699099494, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.616608142852783, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.711043119430542, + "num_tokens": 654516281.0, + "step": 25294 + }, + { + "epoch": 2.777838787612563, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.572237014770508, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7210023403167725, + "num_tokens": 654540311.0, + "step": 25295 + }, + { + "epoch": 2.777948605315177, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.688122272491455, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7180774211883545, + "num_tokens": 654563102.0, + "step": 25296 + }, + { + "epoch": 2.7780584230177903, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2937488555908203, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6990827918052673, + "num_tokens": 654592510.0, + "step": 25297 + }, + { + "epoch": 2.778168240720404, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.424363613128662, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7207375764846802, + "num_tokens": 654615826.0, + "step": 25298 + }, + { + "epoch": 2.778278058423018, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.585693597793579, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.7049535512924194, + "num_tokens": 654641014.0, + "step": 25299 + }, + { + "epoch": 2.7783878761256315, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3208746910095215, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7259746789932251, + "num_tokens": 654669484.0, + "step": 25300 + }, + { + "epoch": 2.7784976938282453, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5819785594940186, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7088541984558105, + "num_tokens": 654691943.0, + "step": 25301 + }, + { + "epoch": 2.7786075115308586, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1775290966033936, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7229671478271484, + "num_tokens": 654724098.0, + "step": 25302 + }, + { + "epoch": 2.7787173292334724, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417463779449463, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7065829038619995, + "num_tokens": 654751649.0, + "step": 25303 + }, + { + "epoch": 2.778827146936086, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.751225233078003, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7264660000801086, + "num_tokens": 654774086.0, + "step": 25304 + }, + { + "epoch": 2.7789369646387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.361915111541748, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6959865093231201, + "num_tokens": 654800126.0, + "step": 25305 + }, + { + "epoch": 2.7790467823413136, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4827511310577393, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.7024545073509216, + "num_tokens": 654828206.0, + "step": 25306 + }, + { + "epoch": 2.779156600043927, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.727379560470581, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7555648684501648, + "num_tokens": 654847164.0, + "step": 25307 + }, + { + "epoch": 2.7792664177465407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6178927421569824, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7216516733169556, + "num_tokens": 654869336.0, + "step": 25308 + }, + { + "epoch": 2.7793762354491545, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.254822015762329, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7231158018112183, + "num_tokens": 654897779.0, + "step": 25309 + }, + { + "epoch": 2.779486053151768, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8410122394561768, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7157454490661621, + "num_tokens": 654917220.0, + "step": 25310 + }, + { + "epoch": 2.779595870854382, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7149734497070312, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7095483541488647, + "num_tokens": 654939692.0, + "step": 25311 + }, + { + "epoch": 2.7797056885569953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4172677993774414, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6854528784751892, + "num_tokens": 654965842.0, + "step": 25312 + }, + { + "epoch": 2.779815506259609, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3406920433044434, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.7010738849639893, + "num_tokens": 654993830.0, + "step": 25313 + }, + { + "epoch": 2.779925323962223, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.497490406036377, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7186700701713562, + "num_tokens": 655015958.0, + "step": 25314 + }, + { + "epoch": 2.780035141664836, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.296252965927124, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7348170280456543, + "num_tokens": 655044444.0, + "step": 25315 + }, + { + "epoch": 2.78014495936745, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3180673122406006, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7214640378952026, + "num_tokens": 655071755.0, + "step": 25316 + }, + { + "epoch": 2.7802547770700636, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4166836738586426, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6847870349884033, + "num_tokens": 655099814.0, + "step": 25317 + }, + { + "epoch": 2.7803645947726774, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.271146535873413, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.729874849319458, + "num_tokens": 655129052.0, + "step": 25318 + }, + { + "epoch": 2.780474412475291, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4527809619903564, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.731998085975647, + "num_tokens": 655153983.0, + "step": 25319 + }, + { + "epoch": 2.7805842301779045, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.411905527114868, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7044464349746704, + "num_tokens": 655180614.0, + "step": 25320 + }, + { + "epoch": 2.780694047880518, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2871198654174805, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6997009515762329, + "num_tokens": 655209629.0, + "step": 25321 + }, + { + "epoch": 2.780803865583132, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4737141132354736, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.722329318523407, + "num_tokens": 655234356.0, + "step": 25322 + }, + { + "epoch": 2.7809136832857457, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2954752445220947, + 
"learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.712100625038147, + "num_tokens": 655263774.0, + "step": 25323 + }, + { + "epoch": 2.7810235009883595, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3718318939208984, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.720700204372406, + "num_tokens": 655290867.0, + "step": 25324 + }, + { + "epoch": 2.781133318690973, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2882111072540283, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.7016178369522095, + "num_tokens": 655319273.0, + "step": 25325 + }, + { + "epoch": 2.7812431363935866, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4375455379486084, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7324893474578857, + "num_tokens": 655344099.0, + "step": 25326 + }, + { + "epoch": 2.7813529540962003, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2435030937194824, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6915419101715088, + "num_tokens": 655376303.0, + "step": 25327 + }, + { + "epoch": 2.781462771798814, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.363910675048828, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.719490110874176, + "num_tokens": 655404148.0, + "step": 25328 + }, + { + "epoch": 2.781572589501428, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6370904445648193, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7054992914199829, + "num_tokens": 655425556.0, + "step": 25329 + }, + { + "epoch": 2.781682407204041, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4784300327301025, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7338095307350159, + "num_tokens": 655448683.0, + "step": 25330 + }, + { + "epoch": 2.781792224906655, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2694387435913086, + "learning_rate": 1e-06, + 
"loss": 0.9277, + "mean_token_accuracy": 0.7211251854896545, + "num_tokens": 655475223.0, + "step": 25331 + }, + { + "epoch": 2.7819020426092687, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1824123859405518, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7241219282150269, + "num_tokens": 655503829.0, + "step": 25332 + }, + { + "epoch": 2.7820118603118824, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.396122932434082, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.7129217982292175, + "num_tokens": 655530470.0, + "step": 25333 + }, + { + "epoch": 2.782121678014496, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5897834300994873, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7177889943122864, + "num_tokens": 655553107.0, + "step": 25334 + }, + { + "epoch": 2.7822314957171095, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4042367935180664, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7173658609390259, + "num_tokens": 655579577.0, + "step": 25335 + }, + { + "epoch": 2.7823413134197232, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5152747631073, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7279869318008423, + "num_tokens": 655604063.0, + "step": 25336 + }, + { + "epoch": 2.782451131122337, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.44547963142395, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7302988171577454, + "num_tokens": 655629811.0, + "step": 25337 + }, + { + "epoch": 2.7825609488249503, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.439889669418335, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6985863447189331, + "num_tokens": 655658096.0, + "step": 25338 + }, + { + "epoch": 2.7826707665275645, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.38843035697937, + "learning_rate": 1e-06, + "loss": 1.004, + 
"mean_token_accuracy": 0.7039631009101868, + "num_tokens": 655686339.0, + "step": 25339 + }, + { + "epoch": 2.782780584230178, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.584340810775757, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6994671821594238, + "num_tokens": 655710433.0, + "step": 25340 + }, + { + "epoch": 2.7828904019327916, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.362044095993042, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7115007638931274, + "num_tokens": 655738374.0, + "step": 25341 + }, + { + "epoch": 2.7830002196354053, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4940826892852783, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7140785455703735, + "num_tokens": 655763612.0, + "step": 25342 + }, + { + "epoch": 2.7831100373380186, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.303588390350342, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.701489269733429, + "num_tokens": 655794128.0, + "step": 25343 + }, + { + "epoch": 2.7832198550406324, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7295451164245605, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7228631973266602, + "num_tokens": 655817587.0, + "step": 25344 + }, + { + "epoch": 2.783329672743246, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.717862129211426, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7174935340881348, + "num_tokens": 655840349.0, + "step": 25345 + }, + { + "epoch": 2.78343949044586, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5560898780822754, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7232975363731384, + "num_tokens": 655863924.0, + "step": 25346 + }, + { + "epoch": 2.7835493081484737, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6453030109405518, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 
0.718892514705658, + "num_tokens": 655886611.0, + "step": 25347 + }, + { + "epoch": 2.783659125851087, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.354651689529419, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7153924703598022, + "num_tokens": 655914869.0, + "step": 25348 + }, + { + "epoch": 2.7837689435537007, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4486634731292725, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7223876118659973, + "num_tokens": 655941675.0, + "step": 25349 + }, + { + "epoch": 2.7838787612563145, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.308199405670166, + "learning_rate": 1e-06, + "loss": 1.0798, + "mean_token_accuracy": 0.6917810440063477, + "num_tokens": 655971798.0, + "step": 25350 + }, + { + "epoch": 2.7839885789589283, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.43476939201355, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7360498905181885, + "num_tokens": 655995942.0, + "step": 25351 + }, + { + "epoch": 2.784098396661542, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0986194610595703, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6964858174324036, + "num_tokens": 656030686.0, + "step": 25352 + }, + { + "epoch": 2.7842082143641553, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.245636463165283, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7139415740966797, + "num_tokens": 656060557.0, + "step": 25353 + }, + { + "epoch": 2.784318032066769, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3065221309661865, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7347559928894043, + "num_tokens": 656086004.0, + "step": 25354 + }, + { + "epoch": 2.784427849769383, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.115692138671875, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7059047222137451, + "num_tokens": 
656111463.0, + "step": 25355 + }, + { + "epoch": 2.7845376674719966, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.526909112930298, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.714297354221344, + "num_tokens": 656136462.0, + "step": 25356 + }, + { + "epoch": 2.7846474851746104, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.568507432937622, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7164425849914551, + "num_tokens": 656160109.0, + "step": 25357 + }, + { + "epoch": 2.7847573028772237, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4700067043304443, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7062689065933228, + "num_tokens": 656184727.0, + "step": 25358 + }, + { + "epoch": 2.7848671205798374, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.801525354385376, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7197228670120239, + "num_tokens": 656204424.0, + "step": 25359 + }, + { + "epoch": 2.784976938282451, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.405130386352539, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7042185664176941, + "num_tokens": 656231785.0, + "step": 25360 + }, + { + "epoch": 2.785086755985065, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6327733993530273, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7153874635696411, + "num_tokens": 656255960.0, + "step": 25361 + }, + { + "epoch": 2.7851965736876787, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1108438968658447, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6883779168128967, + "num_tokens": 656289877.0, + "step": 25362 + }, + { + "epoch": 2.785306391390292, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5969088077545166, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.699790894985199, + "num_tokens": 656313410.0, + "step": 25363 + }, 
+ { + "epoch": 2.7854162090929058, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5325262546539307, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6954225897789001, + "num_tokens": 656338139.0, + "step": 25364 + }, + { + "epoch": 2.7855260267955195, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4319064617156982, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.702037513256073, + "num_tokens": 656365724.0, + "step": 25365 + }, + { + "epoch": 2.785635844498133, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3870532512664795, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7167909145355225, + "num_tokens": 656397010.0, + "step": 25366 + }, + { + "epoch": 2.7857456622007466, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.307929754257202, + "learning_rate": 1e-06, + "loss": 1.084, + "mean_token_accuracy": 0.6844652891159058, + "num_tokens": 656426045.0, + "step": 25367 + }, + { + "epoch": 2.7858554799033604, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7755959033966064, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7329310178756714, + "num_tokens": 656445907.0, + "step": 25368 + }, + { + "epoch": 2.785965297605974, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4271764755249023, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7042950391769409, + "num_tokens": 656471183.0, + "step": 25369 + }, + { + "epoch": 2.786075115308588, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3463196754455566, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.710095226764679, + "num_tokens": 656499016.0, + "step": 25370 + }, + { + "epoch": 2.786184933011201, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.551220178604126, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7097193002700806, + "num_tokens": 656523225.0, + "step": 25371 + }, + { + "epoch": 2.786294750713815, 
+ "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4855923652648926, + "learning_rate": 1e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.691896915435791, + "num_tokens": 656548239.0, + "step": 25372 + }, + { + "epoch": 2.7864045684164287, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3502156734466553, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7202355861663818, + "num_tokens": 656574403.0, + "step": 25373 + }, + { + "epoch": 2.7865143861190425, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.508369207382202, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7265045642852783, + "num_tokens": 656597325.0, + "step": 25374 + }, + { + "epoch": 2.786624203821656, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.378952741622925, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7101097106933594, + "num_tokens": 656624767.0, + "step": 25375 + }, + { + "epoch": 2.7867340215242695, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.433561325073242, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7141367197036743, + "num_tokens": 656650865.0, + "step": 25376 + }, + { + "epoch": 2.7868438392268833, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.351710081100464, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7094985842704773, + "num_tokens": 656679434.0, + "step": 25377 + }, + { + "epoch": 2.786953656929497, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2364449501037598, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7199503183364868, + "num_tokens": 656707461.0, + "step": 25378 + }, + { + "epoch": 2.787063474632111, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.539565324783325, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7203465700149536, + "num_tokens": 656731067.0, + "step": 25379 + }, + { + "epoch": 2.7871732923347246, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.5743134021759033, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.692279577255249, + "num_tokens": 656756016.0, + "step": 25380 + }, + { + "epoch": 2.787283110037338, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6514945030212402, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7297980189323425, + "num_tokens": 656776780.0, + "step": 25381 + }, + { + "epoch": 2.7873929277399516, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4729695320129395, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7079955339431763, + "num_tokens": 656802819.0, + "step": 25382 + }, + { + "epoch": 2.7875027454425654, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.627382278442383, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7128047943115234, + "num_tokens": 656825854.0, + "step": 25383 + }, + { + "epoch": 2.787612563145179, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8065919876098633, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7273781895637512, + "num_tokens": 656846330.0, + "step": 25384 + }, + { + "epoch": 2.787722380847793, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5611488819122314, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7274748086929321, + "num_tokens": 656870508.0, + "step": 25385 + }, + { + "epoch": 2.787832198550406, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.545042037963867, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7348916530609131, + "num_tokens": 656894746.0, + "step": 25386 + }, + { + "epoch": 2.78794201625302, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.316467046737671, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7235154509544373, + "num_tokens": 656924634.0, + "step": 25387 + }, + { + "epoch": 2.7880518339556337, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.1440868377685547, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7247575521469116, + "num_tokens": 656954425.0, + "step": 25388 + }, + { + "epoch": 2.7881616516582475, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3730881214141846, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7064160108566284, + "num_tokens": 656981932.0, + "step": 25389 + }, + { + "epoch": 2.7882714693608612, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3353254795074463, + "learning_rate": 1e-06, + "loss": 1.1093, + "mean_token_accuracy": 0.67917799949646, + "num_tokens": 657014208.0, + "step": 25390 + }, + { + "epoch": 2.7883812870634745, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.145329475402832, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7111709117889404, + "num_tokens": 657043290.0, + "step": 25391 + }, + { + "epoch": 2.7884911047660883, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6863749027252197, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7244329452514648, + "num_tokens": 657064427.0, + "step": 25392 + }, + { + "epoch": 2.788600922468702, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3839027881622314, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7161836624145508, + "num_tokens": 657089835.0, + "step": 25393 + }, + { + "epoch": 2.7887107401713154, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.323251485824585, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7126936912536621, + "num_tokens": 657117918.0, + "step": 25394 + }, + { + "epoch": 2.788820557873929, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2116174697875977, + "learning_rate": 1e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6844810247421265, + "num_tokens": 657150714.0, + "step": 25395 + }, + { + "epoch": 2.788930375576543, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4920549392700195, 
+ "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7142782807350159, + "num_tokens": 657175174.0, + "step": 25396 + }, + { + "epoch": 2.7890401932791566, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.393448829650879, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.704654335975647, + "num_tokens": 657202885.0, + "step": 25397 + }, + { + "epoch": 2.7891500109817704, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2795050144195557, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7098867297172546, + "num_tokens": 657231845.0, + "step": 25398 + }, + { + "epoch": 2.7892598286843837, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2686736583709717, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7144716382026672, + "num_tokens": 657261026.0, + "step": 25399 + }, + { + "epoch": 2.7893696463869975, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2573282718658447, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.7018779516220093, + "num_tokens": 657291041.0, + "step": 25400 + }, + { + "epoch": 2.7894794640896112, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1942455768585205, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7173597812652588, + "num_tokens": 657319078.0, + "step": 25401 + }, + { + "epoch": 2.789589281792225, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.808891773223877, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.744537353515625, + "num_tokens": 657338584.0, + "step": 25402 + }, + { + "epoch": 2.7896990994948387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5671727657318115, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.705723762512207, + "num_tokens": 657361728.0, + "step": 25403 + }, + { + "epoch": 2.789808917197452, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5828664302825928, + "learning_rate": 1e-06, + 
"loss": 0.9093, + "mean_token_accuracy": 0.7231510281562805, + "num_tokens": 657384977.0, + "step": 25404 + }, + { + "epoch": 2.789918734900066, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 6.955898284912109, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6979329586029053, + "num_tokens": 657415883.0, + "step": 25405 + }, + { + "epoch": 2.7900285526026796, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.691909074783325, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7515520453453064, + "num_tokens": 657438089.0, + "step": 25406 + }, + { + "epoch": 2.7901383703052933, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.367023468017578, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.7030823230743408, + "num_tokens": 657467505.0, + "step": 25407 + }, + { + "epoch": 2.790248188007907, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3397014141082764, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6952073574066162, + "num_tokens": 657496368.0, + "step": 25408 + }, + { + "epoch": 2.7903580057105204, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.18660044670105, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7134210467338562, + "num_tokens": 657529212.0, + "step": 25409 + }, + { + "epoch": 2.790467823413134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.269263982772827, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7186955213546753, + "num_tokens": 657556847.0, + "step": 25410 + }, + { + "epoch": 2.790577641115748, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2250583171844482, + "learning_rate": 1e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.6958136558532715, + "num_tokens": 657587255.0, + "step": 25411 + }, + { + "epoch": 2.7906874588183617, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7202234268188477, + "learning_rate": 1e-06, + "loss": 0.8722, + 
"mean_token_accuracy": 0.737389862537384, + "num_tokens": 657607807.0, + "step": 25412 + }, + { + "epoch": 2.7907972765209754, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2739005088806152, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7412919402122498, + "num_tokens": 657632519.0, + "step": 25413 + }, + { + "epoch": 2.7909070942235887, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3528342247009277, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6850032806396484, + "num_tokens": 657660615.0, + "step": 25414 + }, + { + "epoch": 2.7910169119262025, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3694348335266113, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6944500207901001, + "num_tokens": 657688699.0, + "step": 25415 + }, + { + "epoch": 2.7911267296288162, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3068761825561523, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7180039882659912, + "num_tokens": 657717203.0, + "step": 25416 + }, + { + "epoch": 2.7912365473314296, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4052658081054688, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7110533118247986, + "num_tokens": 657743949.0, + "step": 25417 + }, + { + "epoch": 2.7913463650340438, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.146743059158325, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7064582109451294, + "num_tokens": 657774713.0, + "step": 25418 + }, + { + "epoch": 2.791456182736657, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 8.47952938079834, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7406803369522095, + "num_tokens": 657799743.0, + "step": 25419 + }, + { + "epoch": 2.791566000439271, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.265185832977295, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 
0.7294472455978394, + "num_tokens": 657829342.0, + "step": 25420 + }, + { + "epoch": 2.7916758181418846, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.44268536567688, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7401949763298035, + "num_tokens": 657854566.0, + "step": 25421 + }, + { + "epoch": 2.791785635844498, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.603731870651245, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.72629714012146, + "num_tokens": 657879366.0, + "step": 25422 + }, + { + "epoch": 2.7918954535471117, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.455467700958252, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7169305086135864, + "num_tokens": 657905414.0, + "step": 25423 + }, + { + "epoch": 2.7920052712497254, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4049341678619385, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7157679796218872, + "num_tokens": 657931013.0, + "step": 25424 + }, + { + "epoch": 2.792115088952339, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.314739465713501, + "learning_rate": 1e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6911589503288269, + "num_tokens": 657959293.0, + "step": 25425 + }, + { + "epoch": 2.792224906654953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3669841289520264, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6947059035301208, + "num_tokens": 657985708.0, + "step": 25426 + }, + { + "epoch": 2.7923347243575662, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.425816059112549, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7121001482009888, + "num_tokens": 658013079.0, + "step": 25427 + }, + { + "epoch": 2.79244454206018, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2961738109588623, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.7020987272262573, + "num_tokens": 
658042567.0, + "step": 25428 + }, + { + "epoch": 2.7925543597627938, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.257992744445801, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7179619073867798, + "num_tokens": 658072626.0, + "step": 25429 + }, + { + "epoch": 2.7926641774654075, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2150418758392334, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.697132408618927, + "num_tokens": 658102681.0, + "step": 25430 + }, + { + "epoch": 2.7927739951680213, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5113143920898438, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7356022000312805, + "num_tokens": 658125645.0, + "step": 25431 + }, + { + "epoch": 2.7928838128706346, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5605833530426025, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7227654457092285, + "num_tokens": 658148908.0, + "step": 25432 + }, + { + "epoch": 2.7929936305732483, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5924789905548096, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7043587565422058, + "num_tokens": 658171824.0, + "step": 25433 + }, + { + "epoch": 2.793103448275862, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4255588054656982, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7246865630149841, + "num_tokens": 658196709.0, + "step": 25434 + }, + { + "epoch": 2.793213265978476, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.874484062194824, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.734439492225647, + "num_tokens": 658216649.0, + "step": 25435 + }, + { + "epoch": 2.7933230836810896, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3966526985168457, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7317206859588623, + "num_tokens": 658241009.0, + "step": 25436 + 
}, + { + "epoch": 2.793432901383703, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.494382381439209, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7050471901893616, + "num_tokens": 658267432.0, + "step": 25437 + }, + { + "epoch": 2.7935427190863167, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.398552894592285, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7141689658164978, + "num_tokens": 658294948.0, + "step": 25438 + }, + { + "epoch": 2.7936525367889304, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.493199586868286, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7299091219902039, + "num_tokens": 658319781.0, + "step": 25439 + }, + { + "epoch": 2.793762354491544, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2861435413360596, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7074047327041626, + "num_tokens": 658347955.0, + "step": 25440 + }, + { + "epoch": 2.793872172194158, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7178826332092285, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7394566535949707, + "num_tokens": 658370227.0, + "step": 25441 + }, + { + "epoch": 2.7939819898967713, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3162729740142822, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7156515121459961, + "num_tokens": 658399478.0, + "step": 25442 + }, + { + "epoch": 2.794091807599385, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7703351974487305, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7184757590293884, + "num_tokens": 658419334.0, + "step": 25443 + }, + { + "epoch": 2.794201625301999, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.239501714706421, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7307149171829224, + "num_tokens": 658446821.0, + "step": 25444 + }, + { + "epoch": 
2.794311443004612, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.299102306365967, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7424619197845459, + "num_tokens": 658472890.0, + "step": 25445 + }, + { + "epoch": 2.794421260707226, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.27498722076416, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7106709480285645, + "num_tokens": 658501453.0, + "step": 25446 + }, + { + "epoch": 2.7945310784098396, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3952884674072266, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7331296801567078, + "num_tokens": 658525789.0, + "step": 25447 + }, + { + "epoch": 2.7946408961124534, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6386263370513916, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7172688245773315, + "num_tokens": 658547999.0, + "step": 25448 + }, + { + "epoch": 2.794750713815067, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7258644104003906, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7364471554756165, + "num_tokens": 658567804.0, + "step": 25449 + }, + { + "epoch": 2.7948605315176804, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4169018268585205, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7084131240844727, + "num_tokens": 658595068.0, + "step": 25450 + }, + { + "epoch": 2.794970349220294, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5713272094726562, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7126839756965637, + "num_tokens": 658621032.0, + "step": 25451 + }, + { + "epoch": 2.795080166922908, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5879480838775635, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7209421992301941, + "num_tokens": 658641357.0, + "step": 25452 + }, + { + "epoch": 2.7951899846255217, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.4269142150878906, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6927424669265747, + "num_tokens": 658671439.0, + "step": 25453 + }, + { + "epoch": 2.7952998023281355, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.52913236618042, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7134405374526978, + "num_tokens": 658696053.0, + "step": 25454 + }, + { + "epoch": 2.7954096200307488, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.463685989379883, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7269463539123535, + "num_tokens": 658721063.0, + "step": 25455 + }, + { + "epoch": 2.7955194377333625, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4047818183898926, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7345248460769653, + "num_tokens": 658746630.0, + "step": 25456 + }, + { + "epoch": 2.7956292554359763, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3222033977508545, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7254971861839294, + "num_tokens": 658771703.0, + "step": 25457 + }, + { + "epoch": 2.79573907313859, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2595486640930176, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7209060192108154, + "num_tokens": 658800801.0, + "step": 25458 + }, + { + "epoch": 2.795848890841204, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.184659719467163, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7076728343963623, + "num_tokens": 658832820.0, + "step": 25459 + }, + { + "epoch": 2.795958708543817, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1410439014434814, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6843042373657227, + "num_tokens": 658868258.0, + "step": 25460 + }, + { + "epoch": 2.796068526246431, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.5104236602783203, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7145975828170776, + "num_tokens": 658891424.0, + "step": 25461 + }, + { + "epoch": 2.7961783439490446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2952911853790283, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6885271668434143, + "num_tokens": 658919905.0, + "step": 25462 + }, + { + "epoch": 2.7962881616516584, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 8.555290222167969, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7143336534500122, + "num_tokens": 658945164.0, + "step": 25463 + }, + { + "epoch": 2.796397979354272, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.525608539581299, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7385499477386475, + "num_tokens": 658967595.0, + "step": 25464 + }, + { + "epoch": 2.7965077970568855, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2152934074401855, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7141647338867188, + "num_tokens": 658999509.0, + "step": 25465 + }, + { + "epoch": 2.796617614759499, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3294787406921387, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7083114385604858, + "num_tokens": 659028209.0, + "step": 25466 + }, + { + "epoch": 2.796727432462113, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3396530151367188, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.716429591178894, + "num_tokens": 659054733.0, + "step": 25467 + }, + { + "epoch": 2.7968372501647263, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.321838140487671, + "learning_rate": 1e-06, + "loss": 1.0819, + "mean_token_accuracy": 0.6786233186721802, + "num_tokens": 659086566.0, + "step": 25468 + }, + { + "epoch": 2.7969470678673405, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.37962007522583, 
+ "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6978794932365417, + "num_tokens": 659113076.0, + "step": 25469 + }, + { + "epoch": 2.797056885569954, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3191332817077637, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7279255390167236, + "num_tokens": 659140510.0, + "step": 25470 + }, + { + "epoch": 2.7971667032725676, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.220630168914795, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7120909690856934, + "num_tokens": 659171065.0, + "step": 25471 + }, + { + "epoch": 2.7972765209751813, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.423231840133667, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6990798711776733, + "num_tokens": 659198950.0, + "step": 25472 + }, + { + "epoch": 2.7973863386777946, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1965622901916504, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6965641975402832, + "num_tokens": 659228794.0, + "step": 25473 + }, + { + "epoch": 2.7974961563804084, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.566502571105957, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7087329626083374, + "num_tokens": 659250785.0, + "step": 25474 + }, + { + "epoch": 2.797605974083022, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5683817863464355, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7490575313568115, + "num_tokens": 659273650.0, + "step": 25475 + }, + { + "epoch": 2.797715791785636, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4889299869537354, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.735175371170044, + "num_tokens": 659297636.0, + "step": 25476 + }, + { + "epoch": 2.7978256094882497, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3317577838897705, + "learning_rate": 1e-06, + 
"loss": 0.9864, + "mean_token_accuracy": 0.714113712310791, + "num_tokens": 659325961.0, + "step": 25477 + }, + { + "epoch": 2.797935427190863, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.268795967102051, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.696711540222168, + "num_tokens": 659355236.0, + "step": 25478 + }, + { + "epoch": 2.7980452448934767, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.0291748046875, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7309818267822266, + "num_tokens": 659380549.0, + "step": 25479 + }, + { + "epoch": 2.7981550625960905, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4672627449035645, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7347673177719116, + "num_tokens": 659404170.0, + "step": 25480 + }, + { + "epoch": 2.7982648802987042, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.352877616882324, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7401617765426636, + "num_tokens": 659429358.0, + "step": 25481 + }, + { + "epoch": 2.798374698001318, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.550922393798828, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7190775871276855, + "num_tokens": 659452045.0, + "step": 25482 + }, + { + "epoch": 2.7984845157039313, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.253457546234131, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7091497778892517, + "num_tokens": 659482183.0, + "step": 25483 + }, + { + "epoch": 2.798594333406545, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.607292413711548, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7066421508789062, + "num_tokens": 659506878.0, + "step": 25484 + }, + { + "epoch": 2.798704151109159, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3669626712799072, + "learning_rate": 1e-06, + "loss": 0.9956, + 
"mean_token_accuracy": 0.7129786610603333, + "num_tokens": 659534891.0, + "step": 25485 + }, + { + "epoch": 2.7988139688117726, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.320058822631836, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7225887179374695, + "num_tokens": 659563039.0, + "step": 25486 + }, + { + "epoch": 2.7989237865143863, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4966115951538086, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7049474716186523, + "num_tokens": 659587458.0, + "step": 25487 + }, + { + "epoch": 2.7990336042169996, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.480847120285034, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7202804684638977, + "num_tokens": 659610684.0, + "step": 25488 + }, + { + "epoch": 2.7991434219196134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5938453674316406, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7391604781150818, + "num_tokens": 659631616.0, + "step": 25489 + }, + { + "epoch": 2.799253239622227, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.483860969543457, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.7044264674186707, + "num_tokens": 659659811.0, + "step": 25490 + }, + { + "epoch": 2.799363057324841, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6748626232147217, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7332077026367188, + "num_tokens": 659681866.0, + "step": 25491 + }, + { + "epoch": 2.7994728750274547, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2825679779052734, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7122851610183716, + "num_tokens": 659708529.0, + "step": 25492 + }, + { + "epoch": 2.799582692730068, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2510833740234375, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 
0.719552218914032, + "num_tokens": 659737239.0, + "step": 25493 + }, + { + "epoch": 2.7996925104326817, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.710430860519409, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7245434522628784, + "num_tokens": 659758319.0, + "step": 25494 + }, + { + "epoch": 2.7998023281352955, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5377349853515625, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7099415063858032, + "num_tokens": 659781992.0, + "step": 25495 + }, + { + "epoch": 2.799912145837909, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8159263134002686, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7116209864616394, + "num_tokens": 659803204.0, + "step": 25496 + }, + { + "epoch": 2.8000219635405226, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5024759769439697, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7126772403717041, + "num_tokens": 659827306.0, + "step": 25497 + }, + { + "epoch": 2.8001317812431363, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.664860963821411, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7166737914085388, + "num_tokens": 659847819.0, + "step": 25498 + }, + { + "epoch": 2.80024159894575, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.550363540649414, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7072034478187561, + "num_tokens": 659871600.0, + "step": 25499 + }, + { + "epoch": 2.800351416648364, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.251497983932495, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7220777273178101, + "num_tokens": 659900805.0, + "step": 25500 + }, + { + "epoch": 2.800461234350977, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2146291732788086, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7102674841880798, + "num_tokens": 
659929084.0, + "step": 25501 + }, + { + "epoch": 2.800571052053591, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.153477907180786, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7025201320648193, + "num_tokens": 659961327.0, + "step": 25502 + }, + { + "epoch": 2.8006808697562047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4864416122436523, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.69265216588974, + "num_tokens": 659986817.0, + "step": 25503 + }, + { + "epoch": 2.8007906874588184, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.740734577178955, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7053934335708618, + "num_tokens": 660008085.0, + "step": 25504 + }, + { + "epoch": 2.800900505161432, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3354544639587402, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7302378416061401, + "num_tokens": 660034970.0, + "step": 25505 + }, + { + "epoch": 2.8010103228640455, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6592352390289307, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7249643802642822, + "num_tokens": 660056318.0, + "step": 25506 + }, + { + "epoch": 2.8011201405666593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5150442123413086, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7339276075363159, + "num_tokens": 660079209.0, + "step": 25507 + }, + { + "epoch": 2.801229958269273, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.382924795150757, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6910342574119568, + "num_tokens": 660107530.0, + "step": 25508 + }, + { + "epoch": 2.8013397759718868, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.965766191482544, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7228541970252991, + "num_tokens": 660131087.0, + "step": 25509 + 
}, + { + "epoch": 2.8014495936745005, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3011233806610107, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.7012293338775635, + "num_tokens": 660162306.0, + "step": 25510 + }, + { + "epoch": 2.801559411377114, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.466505289077759, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7249020338058472, + "num_tokens": 660186485.0, + "step": 25511 + }, + { + "epoch": 2.8016692290797276, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.374136209487915, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7109891772270203, + "num_tokens": 660210493.0, + "step": 25512 + }, + { + "epoch": 2.8017790467823414, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.504835367202759, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6968667507171631, + "num_tokens": 660233809.0, + "step": 25513 + }, + { + "epoch": 2.801888864484955, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2192001342773438, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7155129313468933, + "num_tokens": 660262261.0, + "step": 25514 + }, + { + "epoch": 2.801998682187569, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.606902599334717, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7268462181091309, + "num_tokens": 660285282.0, + "step": 25515 + }, + { + "epoch": 2.802108499890182, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.414740800857544, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7204995155334473, + "num_tokens": 660312026.0, + "step": 25516 + }, + { + "epoch": 2.802218317592796, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5854225158691406, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7103250622749329, + "num_tokens": 660336797.0, + "step": 25517 + }, + { + "epoch": 
2.8023281352954097, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.354811668395996, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7165913581848145, + "num_tokens": 660363648.0, + "step": 25518 + }, + { + "epoch": 2.802437952998023, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.559708595275879, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7118510603904724, + "num_tokens": 660388352.0, + "step": 25519 + }, + { + "epoch": 2.802547770700637, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5220720767974854, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7110906839370728, + "num_tokens": 660412677.0, + "step": 25520 + }, + { + "epoch": 2.8026575884032505, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.255028486251831, + "learning_rate": 1e-06, + "loss": 1.078, + "mean_token_accuracy": 0.690017819404602, + "num_tokens": 660442384.0, + "step": 25521 + }, + { + "epoch": 2.8027674061058643, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.250641345977783, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7284280061721802, + "num_tokens": 660471277.0, + "step": 25522 + }, + { + "epoch": 2.802877223808478, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.310878276824951, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7140452861785889, + "num_tokens": 660499776.0, + "step": 25523 + }, + { + "epoch": 2.8029870415110913, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2590744495391846, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7030748724937439, + "num_tokens": 660529542.0, + "step": 25524 + }, + { + "epoch": 2.803096859213705, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3266067504882812, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7101961970329285, + "num_tokens": 660557255.0, + "step": 25525 + }, + { + "epoch": 2.803206676916319, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.6543655395507812, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7244764566421509, + "num_tokens": 660577448.0, + "step": 25526 + }, + { + "epoch": 2.8033164946189326, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.131439685821533, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7047528028488159, + "num_tokens": 660611802.0, + "step": 25527 + }, + { + "epoch": 2.8034263123215464, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8189024925231934, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7443258762359619, + "num_tokens": 660632030.0, + "step": 25528 + }, + { + "epoch": 2.8035361300241597, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.473097801208496, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.7006392478942871, + "num_tokens": 660656917.0, + "step": 25529 + }, + { + "epoch": 2.8036459477267734, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8237037658691406, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7351295351982117, + "num_tokens": 660677591.0, + "step": 25530 + }, + { + "epoch": 2.803755765429387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.33682918548584, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7312314510345459, + "num_tokens": 660704066.0, + "step": 25531 + }, + { + "epoch": 2.803865583132001, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.125962257385254, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.7021999359130859, + "num_tokens": 660736158.0, + "step": 25532 + }, + { + "epoch": 2.8039754008346147, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.322657585144043, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7235469818115234, + "num_tokens": 660764931.0, + "step": 25533 + }, + { + "epoch": 2.804085218537228, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.44874906539917, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7014892101287842, + "num_tokens": 660791878.0, + "step": 25534 + }, + { + "epoch": 2.804195036239842, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417325496673584, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7258995175361633, + "num_tokens": 660818303.0, + "step": 25535 + }, + { + "epoch": 2.8043048539424555, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2225093841552734, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7111905217170715, + "num_tokens": 660847686.0, + "step": 25536 + }, + { + "epoch": 2.8044146716450693, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.464069128036499, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7139381766319275, + "num_tokens": 660874618.0, + "step": 25537 + }, + { + "epoch": 2.804524489347683, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6150002479553223, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7360843420028687, + "num_tokens": 660895920.0, + "step": 25538 + }, + { + "epoch": 2.8046343070502964, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6917026042938232, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7207743525505066, + "num_tokens": 660918930.0, + "step": 25539 + }, + { + "epoch": 2.80474412475291, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.022275686264038, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7364559173583984, + "num_tokens": 660946257.0, + "step": 25540 + }, + { + "epoch": 2.804853942455524, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5731022357940674, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.703788161277771, + "num_tokens": 660970851.0, + "step": 25541 + }, + { + "epoch": 2.8049637601581376, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2658944129943848, + 
"learning_rate": 1e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6852148771286011, + "num_tokens": 661001842.0, + "step": 25542 + }, + { + "epoch": 2.8050735778607514, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3431386947631836, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.70859694480896, + "num_tokens": 661033093.0, + "step": 25543 + }, + { + "epoch": 2.8051833955633647, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.595654249191284, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.724470317363739, + "num_tokens": 661055272.0, + "step": 25544 + }, + { + "epoch": 2.8052932132659785, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3981499671936035, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.718286395072937, + "num_tokens": 661080432.0, + "step": 25545 + }, + { + "epoch": 2.8054030309685922, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5472869873046875, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7250820398330688, + "num_tokens": 661105508.0, + "step": 25546 + }, + { + "epoch": 2.8055128486712055, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3153235912323, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.711014449596405, + "num_tokens": 661135946.0, + "step": 25547 + }, + { + "epoch": 2.8056226663738193, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6520228385925293, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7097636461257935, + "num_tokens": 661159454.0, + "step": 25548 + }, + { + "epoch": 2.805732484076433, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4669766426086426, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7468376159667969, + "num_tokens": 661182410.0, + "step": 25549 + }, + { + "epoch": 2.805842301779047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.847996234893799, + "learning_rate": 1e-06, + "loss": 
0.9425, + "mean_token_accuracy": 0.7279711961746216, + "num_tokens": 661201229.0, + "step": 25550 + }, + { + "epoch": 2.8059521194816606, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3201799392700195, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6958298683166504, + "num_tokens": 661229079.0, + "step": 25551 + }, + { + "epoch": 2.806061937184274, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5513782501220703, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7156777381896973, + "num_tokens": 661252597.0, + "step": 25552 + }, + { + "epoch": 2.8061717548868876, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.326403856277466, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6902259588241577, + "num_tokens": 661280899.0, + "step": 25553 + }, + { + "epoch": 2.8062815725895014, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.447324514389038, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7162529230117798, + "num_tokens": 661309258.0, + "step": 25554 + }, + { + "epoch": 2.806391390292115, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.590759754180908, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6938418745994568, + "num_tokens": 661334630.0, + "step": 25555 + }, + { + "epoch": 2.806501207994729, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3795387744903564, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7199591398239136, + "num_tokens": 661361697.0, + "step": 25556 + }, + { + "epoch": 2.806611025697342, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5005416870117188, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7163723707199097, + "num_tokens": 661387248.0, + "step": 25557 + }, + { + "epoch": 2.806720843399956, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4869964122772217, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 
0.7110354900360107, + "num_tokens": 661412647.0, + "step": 25558 + }, + { + "epoch": 2.8068306611025697, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3981988430023193, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7124900817871094, + "num_tokens": 661438448.0, + "step": 25559 + }, + { + "epoch": 2.8069404788051835, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5295374393463135, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7167682647705078, + "num_tokens": 661463863.0, + "step": 25560 + }, + { + "epoch": 2.8070502965077972, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3115296363830566, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7077293992042542, + "num_tokens": 661492452.0, + "step": 25561 + }, + { + "epoch": 2.8071601142104106, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5873589515686035, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7240819931030273, + "num_tokens": 661514392.0, + "step": 25562 + }, + { + "epoch": 2.8072699319130243, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2821285724639893, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7273005247116089, + "num_tokens": 661540750.0, + "step": 25563 + }, + { + "epoch": 2.807379749615638, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5047547817230225, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6972032785415649, + "num_tokens": 661568038.0, + "step": 25564 + }, + { + "epoch": 2.807489567318252, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.471315383911133, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7206102609634399, + "num_tokens": 661594930.0, + "step": 25565 + }, + { + "epoch": 2.8075993850208656, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2064337730407715, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7059927582740784, + 
"num_tokens": 661623558.0, + "step": 25566 + }, + { + "epoch": 2.807709202723479, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.264240026473999, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.71824049949646, + "num_tokens": 661652206.0, + "step": 25567 + }, + { + "epoch": 2.8078190204260927, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.351858615875244, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7041534185409546, + "num_tokens": 661683089.0, + "step": 25568 + }, + { + "epoch": 2.8079288381287064, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3921923637390137, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7347460985183716, + "num_tokens": 661707272.0, + "step": 25569 + }, + { + "epoch": 2.80803865583132, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.565343141555786, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7271019816398621, + "num_tokens": 661730598.0, + "step": 25570 + }, + { + "epoch": 2.808148473533934, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.671722173690796, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7305101156234741, + "num_tokens": 661753754.0, + "step": 25571 + }, + { + "epoch": 2.8082582912365472, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4124088287353516, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7202746868133545, + "num_tokens": 661779277.0, + "step": 25572 + }, + { + "epoch": 2.808368108939161, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.548051357269287, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.721366822719574, + "num_tokens": 661804152.0, + "step": 25573 + }, + { + "epoch": 2.8084779266417748, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6075446605682373, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6985688209533691, + "num_tokens": 661826690.0, + "step": 
25574 + }, + { + "epoch": 2.808587744344388, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2220237255096436, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.7036548852920532, + "num_tokens": 661858275.0, + "step": 25575 + }, + { + "epoch": 2.808697562047002, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.484203815460205, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7025653719902039, + "num_tokens": 661885693.0, + "step": 25576 + }, + { + "epoch": 2.8088073797496156, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.37265944480896, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7099897861480713, + "num_tokens": 661910365.0, + "step": 25577 + }, + { + "epoch": 2.8089171974522293, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.675776243209839, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7314496040344238, + "num_tokens": 661930340.0, + "step": 25578 + }, + { + "epoch": 2.809027015154843, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.723933458328247, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7162293791770935, + "num_tokens": 661950143.0, + "step": 25579 + }, + { + "epoch": 2.8091368328574564, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.856201648712158, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7296867370605469, + "num_tokens": 661972186.0, + "step": 25580 + }, + { + "epoch": 2.80924665056007, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2947304248809814, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7216313481330872, + "num_tokens": 661998612.0, + "step": 25581 + }, + { + "epoch": 2.809356468262684, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3636937141418457, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7188035249710083, + "num_tokens": 662027212.0, + "step": 25582 + }, + { + "epoch": 
2.8094662859652977, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4373598098754883, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7235693335533142, + "num_tokens": 662052504.0, + "step": 25583 + }, + { + "epoch": 2.8095761036679114, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.535339593887329, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7361352443695068, + "num_tokens": 662075237.0, + "step": 25584 + }, + { + "epoch": 2.8096859213705248, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3782455921173096, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7090574502944946, + "num_tokens": 662100237.0, + "step": 25585 + }, + { + "epoch": 2.8097957390731385, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4383132457733154, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7433743476867676, + "num_tokens": 662125680.0, + "step": 25586 + }, + { + "epoch": 2.8099055567757523, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.782759666442871, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7078996896743774, + "num_tokens": 662146244.0, + "step": 25587 + }, + { + "epoch": 2.810015374478366, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2690110206604004, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7206135988235474, + "num_tokens": 662177330.0, + "step": 25588 + }, + { + "epoch": 2.81012519218098, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.713812828063965, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7328813672065735, + "num_tokens": 662197208.0, + "step": 25589 + }, + { + "epoch": 2.810235009883593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.83091139793396, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7498197555541992, + "num_tokens": 662216227.0, + "step": 25590 + }, + { + "epoch": 2.810344827586207, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.4817798137664795, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7057305574417114, + "num_tokens": 662241684.0, + "step": 25591 + }, + { + "epoch": 2.8104546452888206, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.64672589302063, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7209354639053345, + "num_tokens": 662262461.0, + "step": 25592 + }, + { + "epoch": 2.8105644629914344, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.469693183898926, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7222743034362793, + "num_tokens": 662287571.0, + "step": 25593 + }, + { + "epoch": 2.810674280694048, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3555688858032227, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7119839787483215, + "num_tokens": 662317521.0, + "step": 25594 + }, + { + "epoch": 2.8107840983966614, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.302241325378418, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6944146752357483, + "num_tokens": 662346423.0, + "step": 25595 + }, + { + "epoch": 2.810893916099275, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.709700107574463, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7098484635353088, + "num_tokens": 662367305.0, + "step": 25596 + }, + { + "epoch": 2.811003733801889, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.448676586151123, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7152673006057739, + "num_tokens": 662392026.0, + "step": 25597 + }, + { + "epoch": 2.8111135515045023, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.459721803665161, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7333366870880127, + "num_tokens": 662417973.0, + "step": 25598 + }, + { + "epoch": 2.8112233692071165, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.2477409839630127, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7190271615982056, + "num_tokens": 662449961.0, + "step": 25599 + }, + { + "epoch": 2.8113331869097298, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5737853050231934, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.7084301114082336, + "num_tokens": 662474543.0, + "step": 25600 + }, + { + "epoch": 2.8114430046123435, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.400503635406494, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6973643898963928, + "num_tokens": 662503803.0, + "step": 25601 + }, + { + "epoch": 2.8115528223149573, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0831234455108643, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7165008783340454, + "num_tokens": 662538210.0, + "step": 25602 + }, + { + "epoch": 2.8116626400175706, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6182034015655518, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7400863766670227, + "num_tokens": 662559687.0, + "step": 25603 + }, + { + "epoch": 2.8117724577201844, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.479595184326172, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7344516515731812, + "num_tokens": 662583448.0, + "step": 25604 + }, + { + "epoch": 2.811882275422798, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3882858753204346, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7113680839538574, + "num_tokens": 662608705.0, + "step": 25605 + }, + { + "epoch": 2.811992093125412, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.805490732192993, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6914986371994019, + "num_tokens": 662630715.0, + "step": 25606 + }, + { + "epoch": 2.8121019108280256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.3281052112579346, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7031377553939819, + "num_tokens": 662657962.0, + "step": 25607 + }, + { + "epoch": 2.812211728530639, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6611456871032715, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.717732846736908, + "num_tokens": 662684548.0, + "step": 25608 + }, + { + "epoch": 2.8123215462332527, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.418851852416992, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.728286623954773, + "num_tokens": 662710245.0, + "step": 25609 + }, + { + "epoch": 2.8124313639358665, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5048298835754395, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7222362756729126, + "num_tokens": 662735937.0, + "step": 25610 + }, + { + "epoch": 2.81254118163848, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.784531593322754, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7181719541549683, + "num_tokens": 662755845.0, + "step": 25611 + }, + { + "epoch": 2.812650999341094, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5204994678497314, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7082540392875671, + "num_tokens": 662781038.0, + "step": 25612 + }, + { + "epoch": 2.8127608170437073, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5848119258880615, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7122607827186584, + "num_tokens": 662805811.0, + "step": 25613 + }, + { + "epoch": 2.812870634746321, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6290407180786133, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7297356724739075, + "num_tokens": 662826943.0, + "step": 25614 + }, + { + "epoch": 2.812980452448935, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3008158206939697, + 
"learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7096030116081238, + "num_tokens": 662854053.0, + "step": 25615 + }, + { + "epoch": 2.8130902701515486, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3235158920288086, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7021187543869019, + "num_tokens": 662882986.0, + "step": 25616 + }, + { + "epoch": 2.8132000878541623, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.26474666595459, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7159115672111511, + "num_tokens": 662913476.0, + "step": 25617 + }, + { + "epoch": 2.8133099055567756, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.369278907775879, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7276722192764282, + "num_tokens": 662940969.0, + "step": 25618 + }, + { + "epoch": 2.8134197232593894, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1935081481933594, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7313342094421387, + "num_tokens": 662969481.0, + "step": 25619 + }, + { + "epoch": 2.813529540962003, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8233540058135986, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7059385776519775, + "num_tokens": 662990900.0, + "step": 25620 + }, + { + "epoch": 2.813639358664617, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3358800411224365, + "learning_rate": 1e-06, + "loss": 1.0869, + "mean_token_accuracy": 0.6859289407730103, + "num_tokens": 663025348.0, + "step": 25621 + }, + { + "epoch": 2.8137491763672307, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.277679681777954, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7141119837760925, + "num_tokens": 663055529.0, + "step": 25622 + }, + { + "epoch": 2.813858994069844, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.357990264892578, + "learning_rate": 1e-06, + 
"loss": 0.9136, + "mean_token_accuracy": 0.7254371643066406, + "num_tokens": 663082171.0, + "step": 25623 + }, + { + "epoch": 2.8139688117724577, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.701084852218628, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7249640822410583, + "num_tokens": 663102817.0, + "step": 25624 + }, + { + "epoch": 2.8140786294750715, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4677107334136963, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7068044543266296, + "num_tokens": 663126762.0, + "step": 25625 + }, + { + "epoch": 2.814188447177685, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.636080026626587, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.717886209487915, + "num_tokens": 663149265.0, + "step": 25626 + }, + { + "epoch": 2.8142982648802986, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6480324268341064, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7311713099479675, + "num_tokens": 663171640.0, + "step": 25627 + }, + { + "epoch": 2.8144080825829123, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.093101978302002, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7250840663909912, + "num_tokens": 663205859.0, + "step": 25628 + }, + { + "epoch": 2.814517900285526, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.496273994445801, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7379448413848877, + "num_tokens": 663229362.0, + "step": 25629 + }, + { + "epoch": 2.81462771798814, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.582268476486206, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7330809235572815, + "num_tokens": 663251463.0, + "step": 25630 + }, + { + "epoch": 2.814737535690753, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5637569427490234, + "learning_rate": 1e-06, + "loss": 1.0003, + 
"mean_token_accuracy": 0.7099313735961914, + "num_tokens": 663275502.0, + "step": 25631 + }, + { + "epoch": 2.814847353393367, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2470126152038574, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7110609412193298, + "num_tokens": 663305986.0, + "step": 25632 + }, + { + "epoch": 2.8149571710959806, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.452152729034424, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7402085065841675, + "num_tokens": 663330428.0, + "step": 25633 + }, + { + "epoch": 2.8150669887985944, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4990923404693604, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7145135402679443, + "num_tokens": 663356398.0, + "step": 25634 + }, + { + "epoch": 2.815176806501208, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7588253021240234, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7137718796730042, + "num_tokens": 663376647.0, + "step": 25635 + }, + { + "epoch": 2.8152866242038215, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7058916091918945, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.722146213054657, + "num_tokens": 663399079.0, + "step": 25636 + }, + { + "epoch": 2.8153964419064352, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3899953365325928, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7238771319389343, + "num_tokens": 663424783.0, + "step": 25637 + }, + { + "epoch": 2.815506259609049, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.195173501968384, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7047457098960876, + "num_tokens": 663456579.0, + "step": 25638 + }, + { + "epoch": 2.8156160773116627, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4582602977752686, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 
0.7040116190910339, + "num_tokens": 663481581.0, + "step": 25639 + }, + { + "epoch": 2.8157258950142765, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.531196355819702, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7178027629852295, + "num_tokens": 663504593.0, + "step": 25640 + }, + { + "epoch": 2.81583571271689, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.424002170562744, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7235881090164185, + "num_tokens": 663529251.0, + "step": 25641 + }, + { + "epoch": 2.8159455304195036, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.382986545562744, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.738077700138092, + "num_tokens": 663553956.0, + "step": 25642 + }, + { + "epoch": 2.8160553481221173, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.546036958694458, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7248082160949707, + "num_tokens": 663575866.0, + "step": 25643 + }, + { + "epoch": 2.816165165824731, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1698191165924072, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7076658010482788, + "num_tokens": 663609903.0, + "step": 25644 + }, + { + "epoch": 2.816274983527345, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3847105503082275, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.714556097984314, + "num_tokens": 663637331.0, + "step": 25645 + }, + { + "epoch": 2.816384801229958, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.304182767868042, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.722232460975647, + "num_tokens": 663667723.0, + "step": 25646 + }, + { + "epoch": 2.816494618932572, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5086166858673096, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7068840265274048, + "num_tokens": 
663691961.0, + "step": 25647 + }, + { + "epoch": 2.8166044366351857, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8663346767425537, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7416446208953857, + "num_tokens": 663712363.0, + "step": 25648 + }, + { + "epoch": 2.816714254337799, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4066274166107178, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7014373540878296, + "num_tokens": 663744232.0, + "step": 25649 + }, + { + "epoch": 2.816824072040413, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.284874677658081, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7099257111549377, + "num_tokens": 663771402.0, + "step": 25650 + }, + { + "epoch": 2.8169338897430265, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3245437145233154, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7137666940689087, + "num_tokens": 663798841.0, + "step": 25651 + }, + { + "epoch": 2.8170437074456403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2687039375305176, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.70951247215271, + "num_tokens": 663825270.0, + "step": 25652 + }, + { + "epoch": 2.817153525148254, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0913209915161133, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6997807025909424, + "num_tokens": 663856581.0, + "step": 25653 + }, + { + "epoch": 2.8172633428508673, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.323554515838623, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7394042015075684, + "num_tokens": 663882578.0, + "step": 25654 + }, + { + "epoch": 2.817373160553481, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3655080795288086, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7087628841400146, + "num_tokens": 663908507.0, + "step": 25655 + 
}, + { + "epoch": 2.817482978256095, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.386500358581543, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7231287956237793, + "num_tokens": 663934012.0, + "step": 25656 + }, + { + "epoch": 2.8175927959587086, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.028653144836426, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7046101093292236, + "num_tokens": 663971194.0, + "step": 25657 + }, + { + "epoch": 2.8177026136613224, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.452622652053833, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7285140752792358, + "num_tokens": 663996352.0, + "step": 25658 + }, + { + "epoch": 2.8178124313639357, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7339835166931152, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.7039515972137451, + "num_tokens": 664016776.0, + "step": 25659 + }, + { + "epoch": 2.8179222490665494, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3798739910125732, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7420436143875122, + "num_tokens": 664042536.0, + "step": 25660 + }, + { + "epoch": 2.818032066769163, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3481903076171875, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7409366369247437, + "num_tokens": 664070228.0, + "step": 25661 + }, + { + "epoch": 2.818141884471777, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.632418632507324, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7266825437545776, + "num_tokens": 664093475.0, + "step": 25662 + }, + { + "epoch": 2.8182517021743907, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6889986991882324, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7159556150436401, + "num_tokens": 664114673.0, + "step": 25663 + }, + { + "epoch": 
2.818361519877004, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4287924766540527, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7158070802688599, + "num_tokens": 664139000.0, + "step": 25664 + }, + { + "epoch": 2.8184713375796178, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2867751121520996, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7053140997886658, + "num_tokens": 664169452.0, + "step": 25665 + }, + { + "epoch": 2.8185811552822315, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.681389808654785, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7212247848510742, + "num_tokens": 664190664.0, + "step": 25666 + }, + { + "epoch": 2.8186909729848453, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4707119464874268, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7134414911270142, + "num_tokens": 664214414.0, + "step": 25667 + }, + { + "epoch": 2.818800790687459, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2372887134552, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7148432731628418, + "num_tokens": 664243671.0, + "step": 25668 + }, + { + "epoch": 2.8189106083900723, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3439383506774902, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7030124068260193, + "num_tokens": 664272682.0, + "step": 25669 + }, + { + "epoch": 2.819020426092686, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4524483680725098, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7150445580482483, + "num_tokens": 664297167.0, + "step": 25670 + }, + { + "epoch": 2.8191302437953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.272272825241089, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.7023566961288452, + "num_tokens": 664327436.0, + "step": 25671 + }, + { + "epoch": 2.8192400614979136, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.393416404724121, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7140934467315674, + "num_tokens": 664355794.0, + "step": 25672 + }, + { + "epoch": 2.8193498792005274, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3200314044952393, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6920559406280518, + "num_tokens": 664385123.0, + "step": 25673 + }, + { + "epoch": 2.8194596969031407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.398075819015503, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.747798502445221, + "num_tokens": 664399166.0, + "step": 25674 + }, + { + "epoch": 2.8195695146057544, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.350724220275879, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.7038984298706055, + "num_tokens": 664426734.0, + "step": 25675 + }, + { + "epoch": 2.819679332308368, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.539072036743164, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7303268909454346, + "num_tokens": 664449383.0, + "step": 25676 + }, + { + "epoch": 2.8197891500109815, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.209984540939331, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7218281030654907, + "num_tokens": 664478943.0, + "step": 25677 + }, + { + "epoch": 2.8198989677135953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.479044198989868, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7100167274475098, + "num_tokens": 664504458.0, + "step": 25678 + }, + { + "epoch": 2.820008785416209, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.147796154022217, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.7022018432617188, + "num_tokens": 664535890.0, + "step": 25679 + }, + { + "epoch": 2.820118603118823, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.4918081760406494, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7229691743850708, + "num_tokens": 664558345.0, + "step": 25680 + }, + { + "epoch": 2.8202284208214365, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.535553455352783, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7182735204696655, + "num_tokens": 664580299.0, + "step": 25681 + }, + { + "epoch": 2.82033823852405, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8207035064697266, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7185522317886353, + "num_tokens": 664599077.0, + "step": 25682 + }, + { + "epoch": 2.8204480562266636, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5089662075042725, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7170133590698242, + "num_tokens": 664624121.0, + "step": 25683 + }, + { + "epoch": 2.8205578739292774, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0197737216949463, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7181246876716614, + "num_tokens": 664660951.0, + "step": 25684 + }, + { + "epoch": 2.820667691631891, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5696938037872314, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7406439781188965, + "num_tokens": 664684410.0, + "step": 25685 + }, + { + "epoch": 2.820777509334505, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6490061283111572, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7170614004135132, + "num_tokens": 664706851.0, + "step": 25686 + }, + { + "epoch": 2.820887327037118, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.505030632019043, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7106794118881226, + "num_tokens": 664731863.0, + "step": 25687 + }, + { + "epoch": 2.820997144739732, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7764315605163574, 
+ "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7242788672447205, + "num_tokens": 664752228.0, + "step": 25688 + }, + { + "epoch": 2.8211069624423457, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5283725261688232, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7467178106307983, + "num_tokens": 664773933.0, + "step": 25689 + }, + { + "epoch": 2.8212167801449595, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2380454540252686, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7178027629852295, + "num_tokens": 664802609.0, + "step": 25690 + }, + { + "epoch": 2.8213265978475732, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9219937324523926, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7072398066520691, + "num_tokens": 664821200.0, + "step": 25691 + }, + { + "epoch": 2.8214364155501865, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.324486255645752, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7026950120925903, + "num_tokens": 664848426.0, + "step": 25692 + }, + { + "epoch": 2.8215462332528003, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2542436122894287, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7199575901031494, + "num_tokens": 664874057.0, + "step": 25693 + }, + { + "epoch": 2.821656050955414, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.663449764251709, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7168555855751038, + "num_tokens": 664897031.0, + "step": 25694 + }, + { + "epoch": 2.821765868658028, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4827115535736084, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.6872861385345459, + "num_tokens": 664922382.0, + "step": 25695 + }, + { + "epoch": 2.8218756863606416, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.558119535446167, + "learning_rate": 1e-06, + 
"loss": 0.9667, + "mean_token_accuracy": 0.7145822048187256, + "num_tokens": 664946332.0, + "step": 25696 + }, + { + "epoch": 2.821985504063255, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.430185556411743, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7173144817352295, + "num_tokens": 664970614.0, + "step": 25697 + }, + { + "epoch": 2.8220953217658686, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.36567759513855, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7094892263412476, + "num_tokens": 664997642.0, + "step": 25698 + }, + { + "epoch": 2.8222051394684824, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2132835388183594, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7268998026847839, + "num_tokens": 665025423.0, + "step": 25699 + }, + { + "epoch": 2.8223149571710957, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.350496530532837, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7116376757621765, + "num_tokens": 665053357.0, + "step": 25700 + }, + { + "epoch": 2.82242477487371, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4791221618652344, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7218153476715088, + "num_tokens": 665078191.0, + "step": 25701 + }, + { + "epoch": 2.822534592576323, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6357812881469727, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7382675409317017, + "num_tokens": 665100123.0, + "step": 25702 + }, + { + "epoch": 2.822644410278937, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.378352403640747, + "learning_rate": 1e-06, + "loss": 1.1002, + "mean_token_accuracy": 0.6822261214256287, + "num_tokens": 665130091.0, + "step": 25703 + }, + { + "epoch": 2.8227542279815507, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.007002353668213, + "learning_rate": 1e-06, + "loss": 0.9509, + 
"mean_token_accuracy": 0.7188032865524292, + "num_tokens": 665148247.0, + "step": 25704 + }, + { + "epoch": 2.822864045684164, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2097034454345703, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6956403851509094, + "num_tokens": 665178671.0, + "step": 25705 + }, + { + "epoch": 2.822973863386778, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2734224796295166, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7166904211044312, + "num_tokens": 665206942.0, + "step": 25706 + }, + { + "epoch": 2.8230836810893916, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7340030670166016, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.711012065410614, + "num_tokens": 665228016.0, + "step": 25707 + }, + { + "epoch": 2.8231934987920053, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.542226552963257, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.711033284664154, + "num_tokens": 665252564.0, + "step": 25708 + }, + { + "epoch": 2.823303316494619, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.396416425704956, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6979489922523499, + "num_tokens": 665279318.0, + "step": 25709 + }, + { + "epoch": 2.8234131341972324, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.15131950378418, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7254608869552612, + "num_tokens": 665305386.0, + "step": 25710 + }, + { + "epoch": 2.823522951899846, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5962672233581543, + "learning_rate": 1e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7576102018356323, + "num_tokens": 665327383.0, + "step": 25711 + }, + { + "epoch": 2.82363276960246, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.351229667663574, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 
0.7007249593734741, + "num_tokens": 665356505.0, + "step": 25712 + }, + { + "epoch": 2.8237425873050737, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 2.490619421005249, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6974472403526306, + "num_tokens": 665382325.0, + "step": 25713 + }, + { + "epoch": 2.8238524050076874, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3086516857147217, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7268251776695251, + "num_tokens": 665408629.0, + "step": 25714 + }, + { + "epoch": 2.8239622227103007, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.534944534301758, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7090645432472229, + "num_tokens": 665436323.0, + "step": 25715 + }, + { + "epoch": 2.8240720404129145, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2932486534118652, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7537865042686462, + "num_tokens": 665463777.0, + "step": 25716 + }, + { + "epoch": 2.8241818581155282, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.617846727371216, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7170532941818237, + "num_tokens": 665485836.0, + "step": 25717 + }, + { + "epoch": 2.824291675818142, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4602279663085938, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.7076271176338196, + "num_tokens": 665510241.0, + "step": 25718 + }, + { + "epoch": 2.8244014935207558, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3162434101104736, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.6964545249938965, + "num_tokens": 665539111.0, + "step": 25719 + }, + { + "epoch": 2.824511311223369, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2372214794158936, + "learning_rate": 1e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6939014196395874, + 
"num_tokens": 665572159.0, + "step": 25720 + }, + { + "epoch": 2.824621128925983, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3613364696502686, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7025940418243408, + "num_tokens": 665600417.0, + "step": 25721 + }, + { + "epoch": 2.8247309466285966, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5426599979400635, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.706993043422699, + "num_tokens": 665625111.0, + "step": 25722 + }, + { + "epoch": 2.8248407643312103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.817761182785034, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7205629348754883, + "num_tokens": 665645082.0, + "step": 25723 + }, + { + "epoch": 2.824950582033824, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.219973087310791, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7000339031219482, + "num_tokens": 665675922.0, + "step": 25724 + }, + { + "epoch": 2.8250603997364374, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4740583896636963, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7289144396781921, + "num_tokens": 665701672.0, + "step": 25725 + }, + { + "epoch": 2.825170217439051, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5043256282806396, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.725606381893158, + "num_tokens": 665726620.0, + "step": 25726 + }, + { + "epoch": 2.825280035141665, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3241026401519775, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6979027986526489, + "num_tokens": 665752886.0, + "step": 25727 + }, + { + "epoch": 2.8253898528442782, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.596301794052124, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7081400156021118, + "num_tokens": 665776049.0, + 
"step": 25728 + }, + { + "epoch": 2.825499670546892, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4095654487609863, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7184993624687195, + "num_tokens": 665801166.0, + "step": 25729 + }, + { + "epoch": 2.8256094882495058, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2671496868133545, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6933940649032593, + "num_tokens": 665829909.0, + "step": 25730 + }, + { + "epoch": 2.8257193059521195, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7240800857543945, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7180585265159607, + "num_tokens": 665851708.0, + "step": 25731 + }, + { + "epoch": 2.8258291236547333, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.568282127380371, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7081136703491211, + "num_tokens": 665875189.0, + "step": 25732 + }, + { + "epoch": 2.8259389413573466, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5202531814575195, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.706513524055481, + "num_tokens": 665897904.0, + "step": 25733 + }, + { + "epoch": 2.8260487590599603, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4908409118652344, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7359529733657837, + "num_tokens": 665921091.0, + "step": 25734 + }, + { + "epoch": 2.826158576762574, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.254223346710205, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7312061786651611, + "num_tokens": 665948538.0, + "step": 25735 + }, + { + "epoch": 2.826268394465188, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5491392612457275, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7048969864845276, + "num_tokens": 665974966.0, + "step": 25736 + }, + { + 
"epoch": 2.8263782121678016, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.752803325653076, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7237687110900879, + "num_tokens": 665994760.0, + "step": 25737 + }, + { + "epoch": 2.826488029870415, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.498107671737671, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7113877534866333, + "num_tokens": 666021963.0, + "step": 25738 + }, + { + "epoch": 2.8265978475730287, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2299506664276123, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7172219753265381, + "num_tokens": 666050709.0, + "step": 25739 + }, + { + "epoch": 2.8267076652756424, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1826484203338623, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7012578845024109, + "num_tokens": 666080470.0, + "step": 25740 + }, + { + "epoch": 2.826817482978256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3973424434661865, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7059727311134338, + "num_tokens": 666105655.0, + "step": 25741 + }, + { + "epoch": 2.82692730068087, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5918798446655273, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7152671217918396, + "num_tokens": 666129224.0, + "step": 25742 + }, + { + "epoch": 2.8270371183834833, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417789936065674, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7154334783554077, + "num_tokens": 666153247.0, + "step": 25743 + }, + { + "epoch": 2.827146936086097, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.695758819580078, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7133488655090332, + "num_tokens": 666175056.0, + "step": 25744 + }, + { + "epoch": 2.8272567537887108, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1782333850860596, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.7069425582885742, + "num_tokens": 666207477.0, + "step": 25745 + }, + { + "epoch": 2.8273665714913245, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.482581377029419, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7082344889640808, + "num_tokens": 666233368.0, + "step": 25746 + }, + { + "epoch": 2.8274763891939383, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5110697746276855, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7117214202880859, + "num_tokens": 666258159.0, + "step": 25747 + }, + { + "epoch": 2.8275862068965516, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.459186315536499, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7142153978347778, + "num_tokens": 666284219.0, + "step": 25748 + }, + { + "epoch": 2.8276960245991654, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6851325035095215, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7207585573196411, + "num_tokens": 666305589.0, + "step": 25749 + }, + { + "epoch": 2.827805842301779, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7004520893096924, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6991275548934937, + "num_tokens": 666330272.0, + "step": 25750 + }, + { + "epoch": 2.827915660004393, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.42500901222229, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7188175916671753, + "num_tokens": 666356032.0, + "step": 25751 + }, + { + "epoch": 2.8280254777070066, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.137350559234619, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7341781258583069, + "num_tokens": 666372562.0, + "step": 25752 + }, + { + "epoch": 2.82813529540962, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.7379560470581055, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7176498174667358, + "num_tokens": 666395419.0, + "step": 25753 + }, + { + "epoch": 2.8282451131122337, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3860082626342773, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.745511531829834, + "num_tokens": 666420506.0, + "step": 25754 + }, + { + "epoch": 2.8283549308148475, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.265071153640747, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6949860453605652, + "num_tokens": 666450363.0, + "step": 25755 + }, + { + "epoch": 2.8284647485174608, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5864956378936768, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.724970817565918, + "num_tokens": 666472731.0, + "step": 25756 + }, + { + "epoch": 2.8285745662200745, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4373958110809326, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7063751220703125, + "num_tokens": 666498792.0, + "step": 25757 + }, + { + "epoch": 2.8286843839226883, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3450353145599365, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.7034480571746826, + "num_tokens": 666526752.0, + "step": 25758 + }, + { + "epoch": 2.828794201625302, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6240651607513428, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7427952885627747, + "num_tokens": 666548945.0, + "step": 25759 + }, + { + "epoch": 2.828904019327916, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.781794548034668, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.704897940158844, + "num_tokens": 666569835.0, + "step": 25760 + }, + { + "epoch": 2.829013837030529, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.5145885944366455, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7167763710021973, + "num_tokens": 666593958.0, + "step": 25761 + }, + { + "epoch": 2.829123654733143, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5367839336395264, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7074543237686157, + "num_tokens": 666618422.0, + "step": 25762 + }, + { + "epoch": 2.8292334724357566, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8532190322875977, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7374484539031982, + "num_tokens": 666639019.0, + "step": 25763 + }, + { + "epoch": 2.8293432901383704, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.838059425354004, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.71328204870224, + "num_tokens": 666662132.0, + "step": 25764 + }, + { + "epoch": 2.829453107840984, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9200024604797363, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.726718544960022, + "num_tokens": 666680745.0, + "step": 25765 + }, + { + "epoch": 2.8295629255435975, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.357069969177246, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7312312722206116, + "num_tokens": 666708013.0, + "step": 25766 + }, + { + "epoch": 2.829672743246211, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.432366371154785, + "learning_rate": 1e-06, + "loss": 1.1025, + "mean_token_accuracy": 0.6778863668441772, + "num_tokens": 666738869.0, + "step": 25767 + }, + { + "epoch": 2.829782560948825, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3774521350860596, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7199510335922241, + "num_tokens": 666764646.0, + "step": 25768 + }, + { + "epoch": 2.8298923786514387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.426744222640991, + 
"learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6994773149490356, + "num_tokens": 666792890.0, + "step": 25769 + }, + { + "epoch": 2.8300021963540525, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4803903102874756, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7264771461486816, + "num_tokens": 666817196.0, + "step": 25770 + }, + { + "epoch": 2.830112014056666, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.692094326019287, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7520513534545898, + "num_tokens": 666837965.0, + "step": 25771 + }, + { + "epoch": 2.8302218317592795, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.53944993019104, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.7096279263496399, + "num_tokens": 666861412.0, + "step": 25772 + }, + { + "epoch": 2.8303316494618933, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.868516683578491, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7080910205841064, + "num_tokens": 666883947.0, + "step": 25773 + }, + { + "epoch": 2.830441467164507, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6634206771850586, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7364670038223267, + "num_tokens": 666904341.0, + "step": 25774 + }, + { + "epoch": 2.830551284867121, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.572422742843628, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7216189503669739, + "num_tokens": 666926188.0, + "step": 25775 + }, + { + "epoch": 2.830661102569734, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.683887004852295, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.718925952911377, + "num_tokens": 666950234.0, + "step": 25776 + }, + { + "epoch": 2.830770920272348, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3822715282440186, + "learning_rate": 1e-06, + "loss": 
0.9849, + "mean_token_accuracy": 0.7095341682434082, + "num_tokens": 666976412.0, + "step": 25777 + }, + { + "epoch": 2.8308807379749616, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.391788959503174, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6890172958374023, + "num_tokens": 667005457.0, + "step": 25778 + }, + { + "epoch": 2.830990555677575, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6463658809661865, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7218490839004517, + "num_tokens": 667028380.0, + "step": 25779 + }, + { + "epoch": 2.831100373380189, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1722331047058105, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6979783773422241, + "num_tokens": 667059785.0, + "step": 25780 + }, + { + "epoch": 2.8312101910828025, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.575136184692383, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7260463237762451, + "num_tokens": 667083512.0, + "step": 25781 + }, + { + "epoch": 2.8313200087854162, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.707750082015991, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7463111877441406, + "num_tokens": 667104526.0, + "step": 25782 + }, + { + "epoch": 2.83142982648803, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8023104667663574, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7011287212371826, + "num_tokens": 667125859.0, + "step": 25783 + }, + { + "epoch": 2.8315396441906433, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.44218373298645, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7187662124633789, + "num_tokens": 667151027.0, + "step": 25784 + }, + { + "epoch": 2.831649461893257, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1744816303253174, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 
0.7322266101837158, + "num_tokens": 667186120.0, + "step": 25785 + }, + { + "epoch": 2.831759279595871, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6547374725341797, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7086312174797058, + "num_tokens": 667209178.0, + "step": 25786 + }, + { + "epoch": 2.8318690972984846, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4604170322418213, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.725877583026886, + "num_tokens": 667233824.0, + "step": 25787 + }, + { + "epoch": 2.8319789150010983, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6913392543792725, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7269186973571777, + "num_tokens": 667255284.0, + "step": 25788 + }, + { + "epoch": 2.8320887327037116, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2564594745635986, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6975612044334412, + "num_tokens": 667284434.0, + "step": 25789 + }, + { + "epoch": 2.8321985504063254, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5037930011749268, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7110499143600464, + "num_tokens": 667307963.0, + "step": 25790 + }, + { + "epoch": 2.832308368108939, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6456804275512695, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7185896635055542, + "num_tokens": 667330400.0, + "step": 25791 + }, + { + "epoch": 2.832418185811553, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4882302284240723, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7032231092453003, + "num_tokens": 667353480.0, + "step": 25792 + }, + { + "epoch": 2.8325280035141667, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.440208911895752, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7274270057678223, + 
"num_tokens": 667378113.0, + "step": 25793 + }, + { + "epoch": 2.83263782121678, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.252134323120117, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.737602949142456, + "num_tokens": 667405448.0, + "step": 25794 + }, + { + "epoch": 2.8327476389193937, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3523130416870117, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7328938245773315, + "num_tokens": 667431979.0, + "step": 25795 + }, + { + "epoch": 2.8328574566220075, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.565782308578491, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.72806715965271, + "num_tokens": 667454702.0, + "step": 25796 + }, + { + "epoch": 2.8329672743246213, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7390198707580566, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7082850337028503, + "num_tokens": 667476501.0, + "step": 25797 + }, + { + "epoch": 2.833077092027235, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4678618907928467, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7141860723495483, + "num_tokens": 667502412.0, + "step": 25798 + }, + { + "epoch": 2.8331869097298483, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.827653646469116, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7388226985931396, + "num_tokens": 667521563.0, + "step": 25799 + }, + { + "epoch": 2.833296727432462, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4839093685150146, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7126545906066895, + "num_tokens": 667548691.0, + "step": 25800 + }, + { + "epoch": 2.833406545135076, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5928051471710205, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.7009483575820923, + "num_tokens": 667575700.0, + 
"step": 25801 + }, + { + "epoch": 2.8335163628376896, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5926334857940674, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7149274349212646, + "num_tokens": 667599917.0, + "step": 25802 + }, + { + "epoch": 2.8336261805403034, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.369860887527466, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.7010242938995361, + "num_tokens": 667630484.0, + "step": 25803 + }, + { + "epoch": 2.8337359982429167, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1405210494995117, + "learning_rate": 1e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.6911309957504272, + "num_tokens": 667662177.0, + "step": 25804 + }, + { + "epoch": 2.8338458159455304, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.426719903945923, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6980148553848267, + "num_tokens": 667687948.0, + "step": 25805 + }, + { + "epoch": 2.833955633648144, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5897374153137207, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7107299566268921, + "num_tokens": 667711620.0, + "step": 25806 + }, + { + "epoch": 2.8340654513507575, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7251293659210205, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7384604811668396, + "num_tokens": 667733656.0, + "step": 25807 + }, + { + "epoch": 2.8341752690533712, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1722826957702637, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.707285463809967, + "num_tokens": 667766584.0, + "step": 25808 + }, + { + "epoch": 2.834285086755985, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4266319274902344, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.699247419834137, + "num_tokens": 667795665.0, + "step": 25809 + }, + { + 
"epoch": 2.8343949044585988, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.530460834503174, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7167861461639404, + "num_tokens": 667820599.0, + "step": 25810 + }, + { + "epoch": 2.8345047221612125, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3689446449279785, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7244417071342468, + "num_tokens": 667849042.0, + "step": 25811 + }, + { + "epoch": 2.834614539863826, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.293278932571411, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7355790138244629, + "num_tokens": 667877786.0, + "step": 25812 + }, + { + "epoch": 2.8347243575664396, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.715707540512085, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7302789688110352, + "num_tokens": 667900474.0, + "step": 25813 + }, + { + "epoch": 2.8348341752690533, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3529021739959717, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7220256924629211, + "num_tokens": 667930778.0, + "step": 25814 + }, + { + "epoch": 2.834943992971667, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.433678388595581, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7354373931884766, + "num_tokens": 667956985.0, + "step": 25815 + }, + { + "epoch": 2.835053810674281, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.655512809753418, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7423958778381348, + "num_tokens": 667976498.0, + "step": 25816 + }, + { + "epoch": 2.835163628376894, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.606715679168701, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6880045533180237, + "num_tokens": 667999562.0, + "step": 25817 + }, + { + "epoch": 2.835273446079508, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.505547523498535, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7123591899871826, + "num_tokens": 668024673.0, + "step": 25818 + }, + { + "epoch": 2.8353832637821217, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.849905252456665, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7295643091201782, + "num_tokens": 668044984.0, + "step": 25819 + }, + { + "epoch": 2.8354930814847354, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9309332370758057, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7319915294647217, + "num_tokens": 668062676.0, + "step": 25820 + }, + { + "epoch": 2.835602899187349, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1846299171447754, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.69606614112854, + "num_tokens": 668095964.0, + "step": 25821 + }, + { + "epoch": 2.8357127168899625, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.497777223587036, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.710852861404419, + "num_tokens": 668120610.0, + "step": 25822 + }, + { + "epoch": 2.8358225345925763, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 32.331993103027344, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6915072798728943, + "num_tokens": 668154666.0, + "step": 25823 + }, + { + "epoch": 2.83593235229519, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6572184562683105, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.699442982673645, + "num_tokens": 668179455.0, + "step": 25824 + }, + { + "epoch": 2.836042169997804, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4540321826934814, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7295964956283569, + "num_tokens": 668204743.0, + "step": 25825 + }, + { + "epoch": 2.8361519877004175, + "ewc_loss": 2.2411346435546875e-05, 
+ "grad_norm": 2.464526414871216, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7160668969154358, + "num_tokens": 668229429.0, + "step": 25826 + }, + { + "epoch": 2.836261805403031, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4942033290863037, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7312138080596924, + "num_tokens": 668252642.0, + "step": 25827 + }, + { + "epoch": 2.8363716231056446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.607905387878418, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7172752022743225, + "num_tokens": 668274927.0, + "step": 25828 + }, + { + "epoch": 2.8364814408082584, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6223526000976562, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7126955986022949, + "num_tokens": 668300529.0, + "step": 25829 + }, + { + "epoch": 2.8365912585108717, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.568221092224121, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.726370632648468, + "num_tokens": 668323105.0, + "step": 25830 + }, + { + "epoch": 2.836701076213486, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.776888132095337, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7165287137031555, + "num_tokens": 668343240.0, + "step": 25831 + }, + { + "epoch": 2.836810893916099, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.585714340209961, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.7066168785095215, + "num_tokens": 668366235.0, + "step": 25832 + }, + { + "epoch": 2.836920711618713, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5192413330078125, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6941055059432983, + "num_tokens": 668390790.0, + "step": 25833 + }, + { + "epoch": 2.8370305293213267, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.4134488105773926, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.710612416267395, + "num_tokens": 668417367.0, + "step": 25834 + }, + { + "epoch": 2.83714034702394, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.889223337173462, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7229753732681274, + "num_tokens": 668436434.0, + "step": 25835 + }, + { + "epoch": 2.837250164726554, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4437291622161865, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7311481833457947, + "num_tokens": 668462158.0, + "step": 25836 + }, + { + "epoch": 2.8373599824291675, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6264986991882324, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.710573673248291, + "num_tokens": 668484016.0, + "step": 25837 + }, + { + "epoch": 2.8374698001317813, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5884735584259033, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7108151316642761, + "num_tokens": 668509697.0, + "step": 25838 + }, + { + "epoch": 2.837579617834395, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5497000217437744, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7163909673690796, + "num_tokens": 668534839.0, + "step": 25839 + }, + { + "epoch": 2.8376894355370084, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6713368892669678, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7113064527511597, + "num_tokens": 668558902.0, + "step": 25840 + }, + { + "epoch": 2.837799253239622, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3548789024353027, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.7026341557502747, + "num_tokens": 668589644.0, + "step": 25841 + }, + { + "epoch": 2.837909070942236, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1440722942352295, + 
"learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7025056481361389, + "num_tokens": 668623296.0, + "step": 25842 + }, + { + "epoch": 2.8380188886448496, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.182173728942871, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6984269618988037, + "num_tokens": 668652015.0, + "step": 25843 + }, + { + "epoch": 2.8381287063474634, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3004109859466553, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7071565389633179, + "num_tokens": 668681762.0, + "step": 25844 + }, + { + "epoch": 2.8382385240500767, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3149666786193848, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7399218082427979, + "num_tokens": 668706177.0, + "step": 25845 + }, + { + "epoch": 2.8383483417526905, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.915137767791748, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7441893219947815, + "num_tokens": 668724696.0, + "step": 25846 + }, + { + "epoch": 2.838458159455304, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4900827407836914, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7285850048065186, + "num_tokens": 668748726.0, + "step": 25847 + }, + { + "epoch": 2.838567977157918, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.657327175140381, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.730832576751709, + "num_tokens": 668771087.0, + "step": 25848 + }, + { + "epoch": 2.8386777948605317, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.553966522216797, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.7059429883956909, + "num_tokens": 668796119.0, + "step": 25849 + }, + { + "epoch": 2.838787612563145, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.508110761642456, + "learning_rate": 1e-06, + 
"loss": 0.9612, + "mean_token_accuracy": 0.7143504619598389, + "num_tokens": 668821536.0, + "step": 25850 + }, + { + "epoch": 2.838897430265759, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4611048698425293, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7151029109954834, + "num_tokens": 668849118.0, + "step": 25851 + }, + { + "epoch": 2.8390072479683726, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.410306453704834, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6958580613136292, + "num_tokens": 668878656.0, + "step": 25852 + }, + { + "epoch": 2.8391170656709863, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7862956523895264, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7482466697692871, + "num_tokens": 668897345.0, + "step": 25853 + }, + { + "epoch": 2.8392268833736, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.537916898727417, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7079050540924072, + "num_tokens": 668921684.0, + "step": 25854 + }, + { + "epoch": 2.8393367010762134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.337956666946411, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7117441296577454, + "num_tokens": 668948081.0, + "step": 25855 + }, + { + "epoch": 2.839446518778827, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3009307384490967, + "learning_rate": 1e-06, + "loss": 1.0884, + "mean_token_accuracy": 0.6854792237281799, + "num_tokens": 668978585.0, + "step": 25856 + }, + { + "epoch": 2.839556336481441, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.201385736465454, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6969132423400879, + "num_tokens": 669009970.0, + "step": 25857 + }, + { + "epoch": 2.839666154184054, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4273831844329834, + "learning_rate": 1e-06, + "loss": 0.9814, + 
"mean_token_accuracy": 0.7073350548744202, + "num_tokens": 669036112.0, + "step": 25858 + }, + { + "epoch": 2.839775971886668, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4198219776153564, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7195343971252441, + "num_tokens": 669062067.0, + "step": 25859 + }, + { + "epoch": 2.8398857895892817, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3785390853881836, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7288370132446289, + "num_tokens": 669089372.0, + "step": 25860 + }, + { + "epoch": 2.8399956072918955, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.390561580657959, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7182470560073853, + "num_tokens": 669115387.0, + "step": 25861 + }, + { + "epoch": 2.8401054249945092, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5475313663482666, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7067505717277527, + "num_tokens": 669139887.0, + "step": 25862 + }, + { + "epoch": 2.8402152426971226, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.378469228744507, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6924065351486206, + "num_tokens": 669169332.0, + "step": 25863 + }, + { + "epoch": 2.8403250603997363, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.727937698364258, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7121065258979797, + "num_tokens": 669193611.0, + "step": 25864 + }, + { + "epoch": 2.84043487810235, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5113704204559326, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7339338064193726, + "num_tokens": 669217802.0, + "step": 25865 + }, + { + "epoch": 2.840544695804964, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5474586486816406, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 
0.7292917370796204, + "num_tokens": 669241755.0, + "step": 25866 + }, + { + "epoch": 2.8406545135075776, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7631995677948, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.6908568143844604, + "num_tokens": 669263832.0, + "step": 25867 + }, + { + "epoch": 2.840764331210191, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.506303548812866, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7263753414154053, + "num_tokens": 669286704.0, + "step": 25868 + }, + { + "epoch": 2.8408741489128047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3903210163116455, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.715214729309082, + "num_tokens": 669313161.0, + "step": 25869 + }, + { + "epoch": 2.8409839666154184, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8097774982452393, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.7022700309753418, + "num_tokens": 669334300.0, + "step": 25870 + }, + { + "epoch": 2.841093784318032, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3372907638549805, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7141542434692383, + "num_tokens": 669362793.0, + "step": 25871 + }, + { + "epoch": 2.841203602020646, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3097143173217773, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.7141621112823486, + "num_tokens": 669391541.0, + "step": 25872 + }, + { + "epoch": 2.8413134197232592, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.375842809677124, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7231407165527344, + "num_tokens": 669418882.0, + "step": 25873 + }, + { + "epoch": 2.841423237425873, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6439199447631836, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7034612894058228, + "num_tokens": 
669441562.0, + "step": 25874 + }, + { + "epoch": 2.8415330551284868, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.436244249343872, + "learning_rate": 1e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.6874582767486572, + "num_tokens": 669468471.0, + "step": 25875 + }, + { + "epoch": 2.8416428728311005, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5280959606170654, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7024360299110413, + "num_tokens": 669493183.0, + "step": 25876 + }, + { + "epoch": 2.8417526905337143, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.686551570892334, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7090927958488464, + "num_tokens": 669516414.0, + "step": 25877 + }, + { + "epoch": 2.8418625082363276, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.471687078475952, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6975796818733215, + "num_tokens": 669542733.0, + "step": 25878 + }, + { + "epoch": 2.8419723259389413, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.528810739517212, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.726791501045227, + "num_tokens": 669566008.0, + "step": 25879 + }, + { + "epoch": 2.842082143641555, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.458120346069336, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7169786691665649, + "num_tokens": 669593634.0, + "step": 25880 + }, + { + "epoch": 2.8421919613441684, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.307178258895874, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7220202684402466, + "num_tokens": 669621603.0, + "step": 25881 + }, + { + "epoch": 2.8423017790467826, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5551624298095703, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7239379286766052, + "num_tokens": 669644770.0, + "step": 25882 + 
}, + { + "epoch": 2.842411596749396, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.48168683052063, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.716934323310852, + "num_tokens": 669669975.0, + "step": 25883 + }, + { + "epoch": 2.8425214144520097, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.572517156600952, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6899898648262024, + "num_tokens": 669693620.0, + "step": 25884 + }, + { + "epoch": 2.8426312321546234, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3752901554107666, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7108601331710815, + "num_tokens": 669720800.0, + "step": 25885 + }, + { + "epoch": 2.8427410498572367, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6458587646484375, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7361576557159424, + "num_tokens": 669743147.0, + "step": 25886 + }, + { + "epoch": 2.8428508675598505, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3771097660064697, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.705124020576477, + "num_tokens": 669773579.0, + "step": 25887 + }, + { + "epoch": 2.8429606852624643, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.356793165206909, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7058542966842651, + "num_tokens": 669801002.0, + "step": 25888 + }, + { + "epoch": 2.843070502965078, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4990570545196533, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7347768545150757, + "num_tokens": 669823367.0, + "step": 25889 + }, + { + "epoch": 2.8431803206676918, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3153069019317627, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7158234119415283, + "num_tokens": 669853913.0, + "step": 25890 + }, + { + "epoch": 
2.843290138370305, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.293795347213745, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6999468803405762, + "num_tokens": 669885033.0, + "step": 25891 + }, + { + "epoch": 2.843399956072919, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.282432794570923, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7080251574516296, + "num_tokens": 669915585.0, + "step": 25892 + }, + { + "epoch": 2.8435097737755326, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5142786502838135, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7089910507202148, + "num_tokens": 669940756.0, + "step": 25893 + }, + { + "epoch": 2.8436195914781464, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6317460536956787, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7255075573921204, + "num_tokens": 669961158.0, + "step": 25894 + }, + { + "epoch": 2.84372940918076, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5830111503601074, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7164366841316223, + "num_tokens": 669984561.0, + "step": 25895 + }, + { + "epoch": 2.8438392268833734, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.495802640914917, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.712871789932251, + "num_tokens": 670008493.0, + "step": 25896 + }, + { + "epoch": 2.843949044585987, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3304290771484375, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.7004352807998657, + "num_tokens": 670033800.0, + "step": 25897 + }, + { + "epoch": 2.844058862288601, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.311596393585205, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7110757231712341, + "num_tokens": 670060141.0, + "step": 25898 + }, + { + "epoch": 2.8441686799912147, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.3676018714904785, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.723983883857727, + "num_tokens": 670087456.0, + "step": 25899 + }, + { + "epoch": 2.8442784976938285, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.238945245742798, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.713925838470459, + "num_tokens": 670118073.0, + "step": 25900 + }, + { + "epoch": 2.8443883153964418, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4443376064300537, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7174358367919922, + "num_tokens": 670142476.0, + "step": 25901 + }, + { + "epoch": 2.8444981330990555, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3865199089050293, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7135545015335083, + "num_tokens": 670168641.0, + "step": 25902 + }, + { + "epoch": 2.8446079508016693, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2478885650634766, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.7077467441558838, + "num_tokens": 670201852.0, + "step": 25903 + }, + { + "epoch": 2.844717768504283, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5939784049987793, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7297916412353516, + "num_tokens": 670224372.0, + "step": 25904 + }, + { + "epoch": 2.844827586206897, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3265762329101562, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7148798704147339, + "num_tokens": 670252213.0, + "step": 25905 + }, + { + "epoch": 2.84493740390951, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.387807846069336, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.7003612518310547, + "num_tokens": 670280173.0, + "step": 25906 + }, + { + "epoch": 2.845047221612124, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.196298837661743, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.730901837348938, + "num_tokens": 670310822.0, + "step": 25907 + }, + { + "epoch": 2.8451570393147376, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1806282997131348, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7221030592918396, + "num_tokens": 670338498.0, + "step": 25908 + }, + { + "epoch": 2.845266857017351, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6631057262420654, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7449120879173279, + "num_tokens": 670360003.0, + "step": 25909 + }, + { + "epoch": 2.8453766747199647, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1715736389160156, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7070482969284058, + "num_tokens": 670392819.0, + "step": 25910 + }, + { + "epoch": 2.8454864924225785, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3702399730682373, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7043278217315674, + "num_tokens": 670420464.0, + "step": 25911 + }, + { + "epoch": 2.845596310125192, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1906394958496094, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6905677318572998, + "num_tokens": 670451696.0, + "step": 25912 + }, + { + "epoch": 2.845706127827806, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.386359691619873, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7095025777816772, + "num_tokens": 670480806.0, + "step": 25913 + }, + { + "epoch": 2.8458159455304193, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.77005934715271, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7230485081672668, + "num_tokens": 670502025.0, + "step": 25914 + }, + { + "epoch": 2.845925763233033, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2313153743743896, 
+ "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7109463810920715, + "num_tokens": 670533258.0, + "step": 25915 + }, + { + "epoch": 2.846035580935647, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4616732597351074, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7125219106674194, + "num_tokens": 670560018.0, + "step": 25916 + }, + { + "epoch": 2.8461453986382605, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3602538108825684, + "learning_rate": 1e-06, + "loss": 1.091, + "mean_token_accuracy": 0.6856051087379456, + "num_tokens": 670590028.0, + "step": 25917 + }, + { + "epoch": 2.8462552163408743, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5549418926239014, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7302524447441101, + "num_tokens": 670615101.0, + "step": 25918 + }, + { + "epoch": 2.8463650340434876, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2204978466033936, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7296496033668518, + "num_tokens": 670646292.0, + "step": 25919 + }, + { + "epoch": 2.8464748517461014, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5369455814361572, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7198513746261597, + "num_tokens": 670667824.0, + "step": 25920 + }, + { + "epoch": 2.846584669448715, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.667128086090088, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7148505449295044, + "num_tokens": 670691045.0, + "step": 25921 + }, + { + "epoch": 2.846694487151329, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1457533836364746, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7112438678741455, + "num_tokens": 670722992.0, + "step": 25922 + }, + { + "epoch": 2.8468043048539426, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2522952556610107, + "learning_rate": 1e-06, + 
"loss": 0.9753, + "mean_token_accuracy": 0.7090035676956177, + "num_tokens": 670751400.0, + "step": 25923 + }, + { + "epoch": 2.846914122556556, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2155158519744873, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6926298141479492, + "num_tokens": 670780341.0, + "step": 25924 + }, + { + "epoch": 2.8470239402591697, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2879347801208496, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7083238363265991, + "num_tokens": 670809210.0, + "step": 25925 + }, + { + "epoch": 2.8471337579617835, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.267465591430664, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7062885761260986, + "num_tokens": 670837346.0, + "step": 25926 + }, + { + "epoch": 2.8472435756643972, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.170306921005249, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7291423678398132, + "num_tokens": 670866039.0, + "step": 25927 + }, + { + "epoch": 2.847353393367011, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5584964752197266, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7131264209747314, + "num_tokens": 670889048.0, + "step": 25928 + }, + { + "epoch": 2.8474632110696243, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.290009021759033, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6874433755874634, + "num_tokens": 670918102.0, + "step": 25929 + }, + { + "epoch": 2.847573028772238, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4880523681640625, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7182302474975586, + "num_tokens": 670944479.0, + "step": 25930 + }, + { + "epoch": 2.847682846474852, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.323380470275879, + "learning_rate": 1e-06, + "loss": 0.939, + 
"mean_token_accuracy": 0.7193524837493896, + "num_tokens": 670972970.0, + "step": 25931 + }, + { + "epoch": 2.8477926641774656, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7873356342315674, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7059745788574219, + "num_tokens": 670997133.0, + "step": 25932 + }, + { + "epoch": 2.8479024818800793, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.751782178878784, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7071247100830078, + "num_tokens": 671023043.0, + "step": 25933 + }, + { + "epoch": 2.8480122995826926, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.621396780014038, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.705971360206604, + "num_tokens": 671046145.0, + "step": 25934 + }, + { + "epoch": 2.8481221172853064, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4288973808288574, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6904981136322021, + "num_tokens": 671073990.0, + "step": 25935 + }, + { + "epoch": 2.84823193498792, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.574979782104492, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7292548418045044, + "num_tokens": 671096700.0, + "step": 25936 + }, + { + "epoch": 2.8483417526905335, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4605252742767334, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7053543329238892, + "num_tokens": 671121925.0, + "step": 25937 + }, + { + "epoch": 2.8484515703931472, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.247533082962036, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7140858173370361, + "num_tokens": 671149683.0, + "step": 25938 + }, + { + "epoch": 2.848561388095761, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4797990322113037, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 
0.7130069732666016, + "num_tokens": 671174455.0, + "step": 25939 + }, + { + "epoch": 2.8486712057983747, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.446607828140259, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7392651438713074, + "num_tokens": 671197544.0, + "step": 25940 + }, + { + "epoch": 2.8487810235009885, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3001558780670166, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6974719166755676, + "num_tokens": 671227099.0, + "step": 25941 + }, + { + "epoch": 2.848890841203602, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8834359645843506, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7358697652816772, + "num_tokens": 671247278.0, + "step": 25942 + }, + { + "epoch": 2.8490006589062156, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.764730215072632, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7060505151748657, + "num_tokens": 671270021.0, + "step": 25943 + }, + { + "epoch": 2.8491104766088293, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4170916080474854, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7302271127700806, + "num_tokens": 671297070.0, + "step": 25944 + }, + { + "epoch": 2.849220294311443, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2724297046661377, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7029608488082886, + "num_tokens": 671327388.0, + "step": 25945 + }, + { + "epoch": 2.849330112014057, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3053269386291504, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7100598812103271, + "num_tokens": 671357515.0, + "step": 25946 + }, + { + "epoch": 2.84943992971667, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3440957069396973, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7256511449813843, + 
"num_tokens": 671383917.0, + "step": 25947 + }, + { + "epoch": 2.849549747419284, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.782158613204956, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7195552587509155, + "num_tokens": 671405249.0, + "step": 25948 + }, + { + "epoch": 2.8496595651218977, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.464475154876709, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6990081071853638, + "num_tokens": 671434360.0, + "step": 25949 + }, + { + "epoch": 2.8497693828245114, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2011916637420654, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7125266790390015, + "num_tokens": 671463945.0, + "step": 25950 + }, + { + "epoch": 2.849879200527125, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4119791984558105, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7311777472496033, + "num_tokens": 671488614.0, + "step": 25951 + }, + { + "epoch": 2.8499890182297385, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.453831911087036, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7107720971107483, + "num_tokens": 671516910.0, + "step": 25952 + }, + { + "epoch": 2.8500988359323522, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.543238639831543, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7194956541061401, + "num_tokens": 671542002.0, + "step": 25953 + }, + { + "epoch": 2.850208653634966, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.734525203704834, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7099934816360474, + "num_tokens": 671564639.0, + "step": 25954 + }, + { + "epoch": 2.8503184713375798, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5998103618621826, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.707380473613739, + "num_tokens": 671591875.0, + 
"step": 25955 + }, + { + "epoch": 2.8504282890401935, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.554980516433716, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7019676566123962, + "num_tokens": 671618230.0, + "step": 25956 + }, + { + "epoch": 2.850538106742807, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.178485155105591, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7154434323310852, + "num_tokens": 671650632.0, + "step": 25957 + }, + { + "epoch": 2.8506479244454206, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.53181529045105, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7056146860122681, + "num_tokens": 671674132.0, + "step": 25958 + }, + { + "epoch": 2.8507577421480343, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3943240642547607, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.7014901638031006, + "num_tokens": 671701303.0, + "step": 25959 + }, + { + "epoch": 2.8508675598506477, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.377412796020508, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7002439498901367, + "num_tokens": 671731503.0, + "step": 25960 + }, + { + "epoch": 2.850977377553262, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5197787284851074, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7373493313789368, + "num_tokens": 671758936.0, + "step": 25961 + }, + { + "epoch": 2.851087195255875, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.274019479751587, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7033631801605225, + "num_tokens": 671789371.0, + "step": 25962 + }, + { + "epoch": 2.851197012958489, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3833537101745605, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7213472127914429, + "num_tokens": 671815168.0, + "step": 25963 + }, + { + "epoch": 
2.8513068306611027, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.59944486618042, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7424432039260864, + "num_tokens": 671835725.0, + "step": 25964 + }, + { + "epoch": 2.851416648363716, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2801718711853027, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6971113085746765, + "num_tokens": 671864361.0, + "step": 25965 + }, + { + "epoch": 2.8515264660663298, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3527252674102783, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.7025375366210938, + "num_tokens": 671893204.0, + "step": 25966 + }, + { + "epoch": 2.8516362837689435, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.65187931060791, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7163132429122925, + "num_tokens": 671917250.0, + "step": 25967 + }, + { + "epoch": 2.8517461014715573, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.220905065536499, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7007452249526978, + "num_tokens": 671948757.0, + "step": 25968 + }, + { + "epoch": 2.851855919174171, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6366701126098633, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7116889357566833, + "num_tokens": 671971315.0, + "step": 25969 + }, + { + "epoch": 2.8519657368767843, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.471308469772339, + "learning_rate": 1e-06, + "loss": 1.0892, + "mean_token_accuracy": 0.6877299547195435, + "num_tokens": 672001667.0, + "step": 25970 + }, + { + "epoch": 2.852075554579398, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.737205982208252, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7353904247283936, + "num_tokens": 672021785.0, + "step": 25971 + }, + { + "epoch": 2.852185372282012, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.398664712905884, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7168951034545898, + "num_tokens": 672047617.0, + "step": 25972 + }, + { + "epoch": 2.8522951899846256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6973235607147217, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7101332545280457, + "num_tokens": 672068906.0, + "step": 25973 + }, + { + "epoch": 2.8524050076872394, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.296511650085449, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7057675123214722, + "num_tokens": 672096120.0, + "step": 25974 + }, + { + "epoch": 2.8525148253898527, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3887510299682617, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7259055376052856, + "num_tokens": 672123519.0, + "step": 25975 + }, + { + "epoch": 2.8526246430924664, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.460092067718506, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7134691476821899, + "num_tokens": 672148463.0, + "step": 25976 + }, + { + "epoch": 2.85273446079508, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.375277280807495, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7479616403579712, + "num_tokens": 672175014.0, + "step": 25977 + }, + { + "epoch": 2.852844278497694, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7745628356933594, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7152532339096069, + "num_tokens": 672196646.0, + "step": 25978 + }, + { + "epoch": 2.8529540962003077, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4506261348724365, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7135881185531616, + "num_tokens": 672223961.0, + "step": 25979 + }, + { + "epoch": 2.853063913902921, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.2694168090820312, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7191824913024902, + "num_tokens": 672252712.0, + "step": 25980 + }, + { + "epoch": 2.853173731605535, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4861843585968018, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7156259417533875, + "num_tokens": 672276394.0, + "step": 25981 + }, + { + "epoch": 2.8532835493081485, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.498843193054199, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7367538213729858, + "num_tokens": 672300503.0, + "step": 25982 + }, + { + "epoch": 2.8533933670107623, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.727308750152588, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.721055269241333, + "num_tokens": 672322330.0, + "step": 25983 + }, + { + "epoch": 2.853503184713376, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6092798709869385, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7169421911239624, + "num_tokens": 672345273.0, + "step": 25984 + }, + { + "epoch": 2.8536130024159894, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3951404094696045, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.7059125900268555, + "num_tokens": 672372616.0, + "step": 25985 + }, + { + "epoch": 2.853722820118603, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.702989101409912, + "learning_rate": 1e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7588775157928467, + "num_tokens": 672394953.0, + "step": 25986 + }, + { + "epoch": 2.853832637821217, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.308701515197754, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.7043479681015015, + "num_tokens": 672423046.0, + "step": 25987 + }, + { + "epoch": 2.85394245552383, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.846196413040161, + 
"learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7246014475822449, + "num_tokens": 672443008.0, + "step": 25988 + }, + { + "epoch": 2.854052273226444, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4913103580474854, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7174395322799683, + "num_tokens": 672467994.0, + "step": 25989 + }, + { + "epoch": 2.8541620909290577, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.274202585220337, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6921573877334595, + "num_tokens": 672497637.0, + "step": 25990 + }, + { + "epoch": 2.8542719086316715, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3483664989471436, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7072793245315552, + "num_tokens": 672522773.0, + "step": 25991 + }, + { + "epoch": 2.854381726334285, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2577648162841797, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7148711681365967, + "num_tokens": 672550357.0, + "step": 25992 + }, + { + "epoch": 2.8544915440368985, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.526134729385376, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7406499981880188, + "num_tokens": 672572533.0, + "step": 25993 + }, + { + "epoch": 2.8546013617395123, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2564666271209717, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7150640487670898, + "num_tokens": 672599089.0, + "step": 25994 + }, + { + "epoch": 2.854711179442126, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.316572666168213, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7255792021751404, + "num_tokens": 672626113.0, + "step": 25995 + }, + { + "epoch": 2.85482099714474, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5280137062072754, + "learning_rate": 1e-06, + 
"loss": 0.9996, + "mean_token_accuracy": 0.7097809314727783, + "num_tokens": 672651555.0, + "step": 25996 + }, + { + "epoch": 2.8549308148473536, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6370949745178223, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7058982849121094, + "num_tokens": 672674340.0, + "step": 25997 + }, + { + "epoch": 2.855040632549967, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1773571968078613, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7357823252677917, + "num_tokens": 672704680.0, + "step": 25998 + }, + { + "epoch": 2.8551504502525806, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.578061103820801, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6953916549682617, + "num_tokens": 672731569.0, + "step": 25999 + }, + { + "epoch": 2.8552602679551944, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1654531955718994, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6941083669662476, + "num_tokens": 672763196.0, + "step": 26000 + }, + { + "epoch": 2.855370085657808, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.553788900375366, + "learning_rate": 1e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7460470795631409, + "num_tokens": 672784965.0, + "step": 26001 + }, + { + "epoch": 2.855479903360422, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2520358562469482, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6887542009353638, + "num_tokens": 672816672.0, + "step": 26002 + }, + { + "epoch": 2.855589721063035, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.536621332168579, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7351897358894348, + "num_tokens": 672840746.0, + "step": 26003 + }, + { + "epoch": 2.855699538765649, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5071732997894287, + "learning_rate": 1e-06, + "loss": 0.9344, + 
"mean_token_accuracy": 0.7300885915756226, + "num_tokens": 672866472.0, + "step": 26004 + }, + { + "epoch": 2.8558093564682627, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2536046504974365, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7089975476264954, + "num_tokens": 672894541.0, + "step": 26005 + }, + { + "epoch": 2.8559191741708765, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.50602388381958, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.719211220741272, + "num_tokens": 672919064.0, + "step": 26006 + }, + { + "epoch": 2.8560289918734902, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.338453531265259, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7102972269058228, + "num_tokens": 672946396.0, + "step": 26007 + }, + { + "epoch": 2.8561388095761036, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2503881454467773, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7095876932144165, + "num_tokens": 672975469.0, + "step": 26008 + }, + { + "epoch": 2.8562486272787173, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1291327476501465, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7373958230018616, + "num_tokens": 673005393.0, + "step": 26009 + }, + { + "epoch": 2.856358444981331, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8790366649627686, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7375358939170837, + "num_tokens": 673023441.0, + "step": 26010 + }, + { + "epoch": 2.8564682626839444, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3541100025177, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.741325318813324, + "num_tokens": 673049411.0, + "step": 26011 + }, + { + "epoch": 2.8565780803865586, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7749826908111572, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 
0.7190744280815125, + "num_tokens": 673071869.0, + "step": 26012 + }, + { + "epoch": 2.856687898089172, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.498234987258911, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6956243515014648, + "num_tokens": 673098439.0, + "step": 26013 + }, + { + "epoch": 2.8567977157917857, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5842552185058594, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7171582579612732, + "num_tokens": 673120897.0, + "step": 26014 + }, + { + "epoch": 2.8569075334943994, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6612982749938965, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.717927098274231, + "num_tokens": 673141537.0, + "step": 26015 + }, + { + "epoch": 2.8570173511970127, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3128981590270996, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7133039236068726, + "num_tokens": 673169449.0, + "step": 26016 + }, + { + "epoch": 2.8571271688996265, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3293659687042236, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7193956971168518, + "num_tokens": 673197705.0, + "step": 26017 + }, + { + "epoch": 2.8572369866022402, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.271845579147339, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6996665000915527, + "num_tokens": 673227243.0, + "step": 26018 + }, + { + "epoch": 2.857346804304854, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.565706253051758, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7044023275375366, + "num_tokens": 673252030.0, + "step": 26019 + }, + { + "epoch": 2.8574566220074678, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.277268648147583, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7225525379180908, + 
"num_tokens": 673280208.0, + "step": 26020 + }, + { + "epoch": 2.857566439710081, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1089742183685303, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.701878547668457, + "num_tokens": 673311114.0, + "step": 26021 + }, + { + "epoch": 2.857676257412695, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3837788105010986, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7214319705963135, + "num_tokens": 673336523.0, + "step": 26022 + }, + { + "epoch": 2.8577860751153086, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.394094944000244, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.7028640508651733, + "num_tokens": 673363109.0, + "step": 26023 + }, + { + "epoch": 2.8578958928179223, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.389617919921875, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7329429388046265, + "num_tokens": 673388866.0, + "step": 26024 + }, + { + "epoch": 2.858005710520536, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5576188564300537, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7145025730133057, + "num_tokens": 673413250.0, + "step": 26025 + }, + { + "epoch": 2.8581155282231494, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7777457237243652, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7211662530899048, + "num_tokens": 673433658.0, + "step": 26026 + }, + { + "epoch": 2.858225345925763, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5892908573150635, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7286555767059326, + "num_tokens": 673453861.0, + "step": 26027 + }, + { + "epoch": 2.858335163628377, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2651519775390625, + "learning_rate": 1e-06, + "loss": 1.1086, + "mean_token_accuracy": 0.6756171584129333, + "num_tokens": 673484844.0, + 
"step": 26028 + }, + { + "epoch": 2.8584449813309907, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4959020614624023, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7266103625297546, + "num_tokens": 673510058.0, + "step": 26029 + }, + { + "epoch": 2.8585547990336044, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3456132411956787, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.714188814163208, + "num_tokens": 673537680.0, + "step": 26030 + }, + { + "epoch": 2.8586646167362177, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.47646427154541, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7189455032348633, + "num_tokens": 673562118.0, + "step": 26031 + }, + { + "epoch": 2.8587744344388315, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4139163494110107, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7330141067504883, + "num_tokens": 673587339.0, + "step": 26032 + }, + { + "epoch": 2.8588842521414453, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.494093894958496, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7176930904388428, + "num_tokens": 673612062.0, + "step": 26033 + }, + { + "epoch": 2.858994069844059, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3473010063171387, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7218552827835083, + "num_tokens": 673639397.0, + "step": 26034 + }, + { + "epoch": 2.8591038875466728, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1588666439056396, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7253544330596924, + "num_tokens": 673672041.0, + "step": 26035 + }, + { + "epoch": 2.859213705249286, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.647346258163452, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.7074164152145386, + "num_tokens": 673696084.0, + "step": 26036 + }, + { + 
"epoch": 2.8593235229519, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.466076135635376, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7065746188163757, + "num_tokens": 673725454.0, + "step": 26037 + }, + { + "epoch": 2.8594333406545136, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.750452756881714, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7208142876625061, + "num_tokens": 673746551.0, + "step": 26038 + }, + { + "epoch": 2.859543158357127, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.509258985519409, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7266714572906494, + "num_tokens": 673772122.0, + "step": 26039 + }, + { + "epoch": 2.8596529760597407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4182143211364746, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7139413952827454, + "num_tokens": 673797623.0, + "step": 26040 + }, + { + "epoch": 2.8597627937623544, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2088003158569336, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7053948640823364, + "num_tokens": 673828902.0, + "step": 26041 + }, + { + "epoch": 2.859872611464968, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2969839572906494, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7110002040863037, + "num_tokens": 673859008.0, + "step": 26042 + }, + { + "epoch": 2.859982429167582, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4786078929901123, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.728507399559021, + "num_tokens": 673881917.0, + "step": 26043 + }, + { + "epoch": 2.8600922468701953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.241772174835205, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7175975441932678, + "num_tokens": 673912750.0, + "step": 26044 + }, + { + "epoch": 2.860202064572809, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5249874591827393, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7149404287338257, + "num_tokens": 673934835.0, + "step": 26045 + }, + { + "epoch": 2.8603118822754228, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3815841674804688, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7117084860801697, + "num_tokens": 673963165.0, + "step": 26046 + }, + { + "epoch": 2.8604216999780365, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5966014862060547, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7239402532577515, + "num_tokens": 673984440.0, + "step": 26047 + }, + { + "epoch": 2.8605315176806503, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4022934436798096, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7275588512420654, + "num_tokens": 674009921.0, + "step": 26048 + }, + { + "epoch": 2.8606413353832636, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.492820978164673, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7041775584220886, + "num_tokens": 674034168.0, + "step": 26049 + }, + { + "epoch": 2.8607511530858774, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6962215900421143, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.733504593372345, + "num_tokens": 674054595.0, + "step": 26050 + }, + { + "epoch": 2.860860970788491, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.90944766998291, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7147167921066284, + "num_tokens": 674073090.0, + "step": 26051 + }, + { + "epoch": 2.860970788491105, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.248816967010498, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7113470435142517, + "num_tokens": 674102260.0, + "step": 26052 + }, + { + "epoch": 2.8610806061937186, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.780662775039673, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7053236961364746, + "num_tokens": 674123703.0, + "step": 26053 + }, + { + "epoch": 2.861190423896332, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.464421033859253, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6981642842292786, + "num_tokens": 674151101.0, + "step": 26054 + }, + { + "epoch": 2.8613002415989457, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5441484451293945, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7010732889175415, + "num_tokens": 674176981.0, + "step": 26055 + }, + { + "epoch": 2.8614100593015594, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3871099948883057, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7011334896087646, + "num_tokens": 674202164.0, + "step": 26056 + }, + { + "epoch": 2.861519877004173, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5412352085113525, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7254447340965271, + "num_tokens": 674227273.0, + "step": 26057 + }, + { + "epoch": 2.861629694706787, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4889557361602783, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7248591184616089, + "num_tokens": 674251889.0, + "step": 26058 + }, + { + "epoch": 2.8617395124094003, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.265674352645874, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6939895749092102, + "num_tokens": 674282868.0, + "step": 26059 + }, + { + "epoch": 2.861849330112014, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4082283973693848, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.710257351398468, + "num_tokens": 674310603.0, + "step": 26060 + }, + { + "epoch": 2.861959147814628, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.5642709732055664, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7168998122215271, + "num_tokens": 674333678.0, + "step": 26061 + }, + { + "epoch": 2.862068965517241, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3892972469329834, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7355902194976807, + "num_tokens": 674361458.0, + "step": 26062 + }, + { + "epoch": 2.8621787832198553, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6128101348876953, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7324478626251221, + "num_tokens": 674384351.0, + "step": 26063 + }, + { + "epoch": 2.8622886009224686, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.301734447479248, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.694854736328125, + "num_tokens": 674411895.0, + "step": 26064 + }, + { + "epoch": 2.8623984186250824, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3605329990386963, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.686263918876648, + "num_tokens": 674441370.0, + "step": 26065 + }, + { + "epoch": 2.862508236327696, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.15401291847229, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6947362422943115, + "num_tokens": 674473057.0, + "step": 26066 + }, + { + "epoch": 2.8626180540303094, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.376267671585083, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.722148060798645, + "num_tokens": 674500205.0, + "step": 26067 + }, + { + "epoch": 2.862727871732923, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5889432430267334, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7155411839485168, + "num_tokens": 674522500.0, + "step": 26068 + }, + { + "epoch": 2.862837689435537, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2377426624298096, + 
"learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6976034641265869, + "num_tokens": 674553971.0, + "step": 26069 + }, + { + "epoch": 2.8629475071381507, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1686112880706787, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6984836459159851, + "num_tokens": 674583947.0, + "step": 26070 + }, + { + "epoch": 2.8630573248407645, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3924927711486816, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7054799795150757, + "num_tokens": 674609286.0, + "step": 26071 + }, + { + "epoch": 2.863167142543378, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.960402250289917, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7183905243873596, + "num_tokens": 674627776.0, + "step": 26072 + }, + { + "epoch": 2.8632769602459915, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4779152870178223, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7191509008407593, + "num_tokens": 674652947.0, + "step": 26073 + }, + { + "epoch": 2.8633867779486053, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2599217891693115, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.7038652300834656, + "num_tokens": 674680767.0, + "step": 26074 + }, + { + "epoch": 2.863496595651219, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.379295825958252, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6999828219413757, + "num_tokens": 674706645.0, + "step": 26075 + }, + { + "epoch": 2.863606413353833, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3275399208068848, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7103201150894165, + "num_tokens": 674735371.0, + "step": 26076 + }, + { + "epoch": 2.863716231056446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4027130603790283, + "learning_rate": 1e-06, + 
"loss": 1.0308, + "mean_token_accuracy": 0.7028712630271912, + "num_tokens": 674761652.0, + "step": 26077 + }, + { + "epoch": 2.86382604875906, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2105214595794678, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7244229912757874, + "num_tokens": 674790968.0, + "step": 26078 + }, + { + "epoch": 2.8639358664616736, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.250647783279419, + "learning_rate": 1e-06, + "loss": 1.1127, + "mean_token_accuracy": 0.6861016154289246, + "num_tokens": 674820329.0, + "step": 26079 + }, + { + "epoch": 2.8640456841642874, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4813952445983887, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7252908945083618, + "num_tokens": 674843544.0, + "step": 26080 + }, + { + "epoch": 2.864155501866901, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.606135129928589, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7378667593002319, + "num_tokens": 674866950.0, + "step": 26081 + }, + { + "epoch": 2.8642653195695145, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4414854049682617, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6863571405410767, + "num_tokens": 674892458.0, + "step": 26082 + }, + { + "epoch": 2.8643751372721282, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.745605945587158, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7156171798706055, + "num_tokens": 674912625.0, + "step": 26083 + }, + { + "epoch": 2.864484954974742, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4552364349365234, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7093874216079712, + "num_tokens": 674937917.0, + "step": 26084 + }, + { + "epoch": 2.8645947726773557, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3835039138793945, + "learning_rate": 1e-06, + "loss": 0.8915, + 
"mean_token_accuracy": 0.7310317754745483, + "num_tokens": 674963788.0, + "step": 26085 + }, + { + "epoch": 2.8647045903799695, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6355366706848145, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7260880470275879, + "num_tokens": 674985862.0, + "step": 26086 + }, + { + "epoch": 2.864814408082583, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6892383098602295, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7153796553611755, + "num_tokens": 675006808.0, + "step": 26087 + }, + { + "epoch": 2.8649242257851966, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.391467571258545, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.717699408531189, + "num_tokens": 675033623.0, + "step": 26088 + }, + { + "epoch": 2.8650340434878103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.627229690551758, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7306827306747437, + "num_tokens": 675053083.0, + "step": 26089 + }, + { + "epoch": 2.8651438611904236, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2687225341796875, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7078083753585815, + "num_tokens": 675082485.0, + "step": 26090 + }, + { + "epoch": 2.8652536788930374, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6806466579437256, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6957038640975952, + "num_tokens": 675105349.0, + "step": 26091 + }, + { + "epoch": 2.865363496595651, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4470860958099365, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.714705765247345, + "num_tokens": 675129468.0, + "step": 26092 + }, + { + "epoch": 2.865473314298265, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2452261447906494, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 
0.7130277156829834, + "num_tokens": 675158711.0, + "step": 26093 + }, + { + "epoch": 2.8655831320008787, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2757747173309326, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7419408559799194, + "num_tokens": 675186157.0, + "step": 26094 + }, + { + "epoch": 2.865692949703492, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.258819818496704, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7097101211547852, + "num_tokens": 675213161.0, + "step": 26095 + }, + { + "epoch": 2.8658027674061057, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.277796983718872, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7075221538543701, + "num_tokens": 675243061.0, + "step": 26096 + }, + { + "epoch": 2.8659125851087195, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6072239875793457, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7282283306121826, + "num_tokens": 675267805.0, + "step": 26097 + }, + { + "epoch": 2.8660224028113332, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.763248920440674, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7318569421768188, + "num_tokens": 675288863.0, + "step": 26098 + }, + { + "epoch": 2.866132220513947, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.414529800415039, + "learning_rate": 1e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.744522750377655, + "num_tokens": 675312823.0, + "step": 26099 + }, + { + "epoch": 2.8662420382165603, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2197558879852295, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.711967408657074, + "num_tokens": 675343825.0, + "step": 26100 + }, + { + "epoch": 2.866351855919174, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6454055309295654, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7075375914573669, + 
"num_tokens": 675364877.0, + "step": 26101 + }, + { + "epoch": 2.866461673621788, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.358144998550415, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7158381938934326, + "num_tokens": 675390248.0, + "step": 26102 + }, + { + "epoch": 2.8665714913244016, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.281365394592285, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7144321203231812, + "num_tokens": 675418776.0, + "step": 26103 + }, + { + "epoch": 2.8666813090270153, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.066119909286499, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7324771881103516, + "num_tokens": 675452449.0, + "step": 26104 + }, + { + "epoch": 2.8667911267296287, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7032346725463867, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7118679285049438, + "num_tokens": 675472964.0, + "step": 26105 + }, + { + "epoch": 2.8669009444322424, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5337023735046387, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7130993604660034, + "num_tokens": 675499433.0, + "step": 26106 + }, + { + "epoch": 2.867010762134856, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.214951515197754, + "learning_rate": 1e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6946923136711121, + "num_tokens": 675531158.0, + "step": 26107 + }, + { + "epoch": 2.86712057983747, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.594683885574341, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7279186248779297, + "num_tokens": 675552905.0, + "step": 26108 + }, + { + "epoch": 2.8672303975400837, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.511190176010132, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7184172868728638, + "num_tokens": 675577409.0, + 
"step": 26109 + }, + { + "epoch": 2.867340215242697, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.499465227127075, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7293118238449097, + "num_tokens": 675600582.0, + "step": 26110 + }, + { + "epoch": 2.8674500329453108, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6435375213623047, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7374514937400818, + "num_tokens": 675623921.0, + "step": 26111 + }, + { + "epoch": 2.8675598506479245, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7612340450286865, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7448869943618774, + "num_tokens": 675644555.0, + "step": 26112 + }, + { + "epoch": 2.8676696683505383, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7272088527679443, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7145304679870605, + "num_tokens": 675667029.0, + "step": 26113 + }, + { + "epoch": 2.867779486053152, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.198805570602417, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.692828893661499, + "num_tokens": 675702160.0, + "step": 26114 + }, + { + "epoch": 2.8678893037557653, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5821027755737305, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7320184707641602, + "num_tokens": 675723715.0, + "step": 26115 + }, + { + "epoch": 2.867999121458379, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.438232183456421, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7095633745193481, + "num_tokens": 675749148.0, + "step": 26116 + }, + { + "epoch": 2.868108939160993, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.366908073425293, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7043067812919617, + "num_tokens": 675776758.0, + "step": 26117 + }, + { + "epoch": 
2.868218756863606, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5321688652038574, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7095071077346802, + "num_tokens": 675800674.0, + "step": 26118 + }, + { + "epoch": 2.86832857456622, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.300367593765259, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6975933909416199, + "num_tokens": 675827345.0, + "step": 26119 + }, + { + "epoch": 2.8684383922688337, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0903830528259277, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7135529518127441, + "num_tokens": 675861466.0, + "step": 26120 + }, + { + "epoch": 2.8685482099714474, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5545082092285156, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7235561609268188, + "num_tokens": 675884135.0, + "step": 26121 + }, + { + "epoch": 2.868658027674061, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.488250494003296, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7150707244873047, + "num_tokens": 675908730.0, + "step": 26122 + }, + { + "epoch": 2.8687678453766745, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.461129665374756, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.735741376876831, + "num_tokens": 675932288.0, + "step": 26123 + }, + { + "epoch": 2.8688776630792883, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8890717029571533, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7300645112991333, + "num_tokens": 675954517.0, + "step": 26124 + }, + { + "epoch": 2.868987480781902, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3346190452575684, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7192416191101074, + "num_tokens": 675980211.0, + "step": 26125 + }, + { + "epoch": 2.869097298484516, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.3733627796173096, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7345197796821594, + "num_tokens": 676005009.0, + "step": 26126 + }, + { + "epoch": 2.8692071161871295, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4385814666748047, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7152165174484253, + "num_tokens": 676030630.0, + "step": 26127 + }, + { + "epoch": 2.869316933889743, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3910164833068848, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7250320911407471, + "num_tokens": 676055394.0, + "step": 26128 + }, + { + "epoch": 2.8694267515923566, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3542885780334473, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7249161601066589, + "num_tokens": 676081325.0, + "step": 26129 + }, + { + "epoch": 2.8695365692949704, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4211905002593994, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6969698667526245, + "num_tokens": 676107605.0, + "step": 26130 + }, + { + "epoch": 2.869646386997584, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.634521007537842, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7183569073677063, + "num_tokens": 676130392.0, + "step": 26131 + }, + { + "epoch": 2.869756204700198, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4149422645568848, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.709153413772583, + "num_tokens": 676155618.0, + "step": 26132 + }, + { + "epoch": 2.869866022402811, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4857242107391357, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7187992334365845, + "num_tokens": 676180193.0, + "step": 26133 + }, + { + "epoch": 2.869975840105425, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.5167791843414307, + "learning_rate": 1e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.6841209530830383, + "num_tokens": 676207552.0, + "step": 26134 + }, + { + "epoch": 2.8700856578080387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6260013580322266, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7129250764846802, + "num_tokens": 676231269.0, + "step": 26135 + }, + { + "epoch": 2.8701954755106525, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7662689685821533, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7345305681228638, + "num_tokens": 676251030.0, + "step": 26136 + }, + { + "epoch": 2.870305293213266, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4629316329956055, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7345708608627319, + "num_tokens": 676277159.0, + "step": 26137 + }, + { + "epoch": 2.8704151109158795, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5408942699432373, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7166661620140076, + "num_tokens": 676302300.0, + "step": 26138 + }, + { + "epoch": 2.8705249286184933, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.742539405822754, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7302966713905334, + "num_tokens": 676322622.0, + "step": 26139 + }, + { + "epoch": 2.870634746321107, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5742855072021484, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7023953199386597, + "num_tokens": 676347935.0, + "step": 26140 + }, + { + "epoch": 2.8707445640237204, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.240882635116577, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.691973090171814, + "num_tokens": 676379047.0, + "step": 26141 + }, + { + "epoch": 2.8708543817263346, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.2881999015808105, + "learning_rate": 1e-06, + "loss": 1.1027, + "mean_token_accuracy": 0.6746822595596313, + "num_tokens": 676408422.0, + "step": 26142 + }, + { + "epoch": 2.870964199428948, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4691309928894043, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.70660400390625, + "num_tokens": 676435121.0, + "step": 26143 + }, + { + "epoch": 2.8710740171315616, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4368984699249268, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.7037467360496521, + "num_tokens": 676460750.0, + "step": 26144 + }, + { + "epoch": 2.8711838348341754, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.621598482131958, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7281449437141418, + "num_tokens": 676482900.0, + "step": 26145 + }, + { + "epoch": 2.8712936525367887, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3950791358947754, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7122639417648315, + "num_tokens": 676510402.0, + "step": 26146 + }, + { + "epoch": 2.8714034702394025, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1516382694244385, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7222726345062256, + "num_tokens": 676541557.0, + "step": 26147 + }, + { + "epoch": 2.871513287942016, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.34264874458313, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7331225872039795, + "num_tokens": 676567699.0, + "step": 26148 + }, + { + "epoch": 2.87162310564463, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.571441411972046, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7189530730247498, + "num_tokens": 676590430.0, + "step": 26149 + }, + { + "epoch": 2.8717329233472437, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3436970710754395, + 
"learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7098212838172913, + "num_tokens": 676616324.0, + "step": 26150 + }, + { + "epoch": 2.871842741049857, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.231358528137207, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7178438901901245, + "num_tokens": 676645348.0, + "step": 26151 + }, + { + "epoch": 2.871952558752471, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8214845657348633, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7302188277244568, + "num_tokens": 676664263.0, + "step": 26152 + }, + { + "epoch": 2.8720623764550846, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.400521993637085, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7213371992111206, + "num_tokens": 676689852.0, + "step": 26153 + }, + { + "epoch": 2.8721721941576983, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5129153728485107, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6946130990982056, + "num_tokens": 676716524.0, + "step": 26154 + }, + { + "epoch": 2.872282011860312, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4105827808380127, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7267913818359375, + "num_tokens": 676741984.0, + "step": 26155 + }, + { + "epoch": 2.8723918295629254, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2172796726226807, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.693467915058136, + "num_tokens": 676774168.0, + "step": 26156 + }, + { + "epoch": 2.872501647265539, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2286620140075684, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.7008074522018433, + "num_tokens": 676806321.0, + "step": 26157 + }, + { + "epoch": 2.872611464968153, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.461972951889038, + "learning_rate": 1e-06, + 
"loss": 0.9187, + "mean_token_accuracy": 0.7277361154556274, + "num_tokens": 676830146.0, + "step": 26158 + }, + { + "epoch": 2.8727212826707667, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.297804117202759, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7296397089958191, + "num_tokens": 676856918.0, + "step": 26159 + }, + { + "epoch": 2.8728311003733804, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.705517530441284, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.723398745059967, + "num_tokens": 676877334.0, + "step": 26160 + }, + { + "epoch": 2.8729409180759937, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5410585403442383, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7161601781845093, + "num_tokens": 676902466.0, + "step": 26161 + }, + { + "epoch": 2.8730507357786075, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.646328926086426, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7200902700424194, + "num_tokens": 676923813.0, + "step": 26162 + }, + { + "epoch": 2.8731605534812212, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6367506980895996, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.717159628868103, + "num_tokens": 676946898.0, + "step": 26163 + }, + { + "epoch": 2.873270371183835, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5864145755767822, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7105684280395508, + "num_tokens": 676972284.0, + "step": 26164 + }, + { + "epoch": 2.8733801888864487, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5624475479125977, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7124104499816895, + "num_tokens": 676997213.0, + "step": 26165 + }, + { + "epoch": 2.873490006589062, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6249263286590576, + "learning_rate": 1e-06, + "loss": 0.9717, + 
"mean_token_accuracy": 0.7077828645706177, + "num_tokens": 677020832.0, + "step": 26166 + }, + { + "epoch": 2.873599824291676, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.597916841506958, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7040849924087524, + "num_tokens": 677044474.0, + "step": 26167 + }, + { + "epoch": 2.8737096419942896, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.342958688735962, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7230485677719116, + "num_tokens": 677071226.0, + "step": 26168 + }, + { + "epoch": 2.873819459696903, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.562922239303589, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7382893562316895, + "num_tokens": 677094413.0, + "step": 26169 + }, + { + "epoch": 2.8739292773995166, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.163224458694458, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7006402015686035, + "num_tokens": 677128378.0, + "step": 26170 + }, + { + "epoch": 2.8740390951021304, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7463393211364746, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7047539949417114, + "num_tokens": 677151911.0, + "step": 26171 + }, + { + "epoch": 2.874148912804744, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.270301103591919, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7279660105705261, + "num_tokens": 677180932.0, + "step": 26172 + }, + { + "epoch": 2.874258730507358, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417337417602539, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7342475652694702, + "num_tokens": 677204864.0, + "step": 26173 + }, + { + "epoch": 2.8743685482099712, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3953330516815186, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 
0.7095056772232056, + "num_tokens": 677229322.0, + "step": 26174 + }, + { + "epoch": 2.874478365912585, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.682835102081299, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7160109281539917, + "num_tokens": 677251165.0, + "step": 26175 + }, + { + "epoch": 2.8745881836151987, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.677821159362793, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7141871452331543, + "num_tokens": 677276506.0, + "step": 26176 + }, + { + "epoch": 2.8746980013178125, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.418081760406494, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7015102505683899, + "num_tokens": 677303683.0, + "step": 26177 + }, + { + "epoch": 2.8748078190204263, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.344301462173462, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7318415641784668, + "num_tokens": 677329909.0, + "step": 26178 + }, + { + "epoch": 2.8749176367230396, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6636524200439453, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7106058597564697, + "num_tokens": 677352625.0, + "step": 26179 + }, + { + "epoch": 2.8750274544256533, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.458743095397949, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7199786901473999, + "num_tokens": 677377575.0, + "step": 26180 + }, + { + "epoch": 2.875137272128267, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.124812364578247, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7036550045013428, + "num_tokens": 677413143.0, + "step": 26181 + }, + { + "epoch": 2.875247089830881, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.539210319519043, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7109457850456238, + 
"num_tokens": 677437420.0, + "step": 26182 + }, + { + "epoch": 2.8753569075334946, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.512193202972412, + "learning_rate": 1e-06, + "loss": 1.0805, + "mean_token_accuracy": 0.6878821849822998, + "num_tokens": 677460817.0, + "step": 26183 + }, + { + "epoch": 2.875466725236108, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4113941192626953, + "learning_rate": 1e-06, + "loss": 1.0957, + "mean_token_accuracy": 0.6833196878433228, + "num_tokens": 677490990.0, + "step": 26184 + }, + { + "epoch": 2.8755765429387217, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4153988361358643, + "learning_rate": 1e-06, + "loss": 1.0805, + "mean_token_accuracy": 0.682198166847229, + "num_tokens": 677519083.0, + "step": 26185 + }, + { + "epoch": 2.8756863606413354, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7345900535583496, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.7070308327674866, + "num_tokens": 677541676.0, + "step": 26186 + }, + { + "epoch": 2.875796178343949, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1417601108551025, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6911023855209351, + "num_tokens": 677574898.0, + "step": 26187 + }, + { + "epoch": 2.875905996046563, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.627855062484741, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7080958485603333, + "num_tokens": 677597511.0, + "step": 26188 + }, + { + "epoch": 2.8760158137491763, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4901626110076904, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7193163633346558, + "num_tokens": 677621779.0, + "step": 26189 + }, + { + "epoch": 2.87612563145179, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6454660892486572, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7025948166847229, + "num_tokens": 677645561.0, + 
"step": 26190 + }, + { + "epoch": 2.8762354491544038, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.470902442932129, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7223014831542969, + "num_tokens": 677669700.0, + "step": 26191 + }, + { + "epoch": 2.876345266857017, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3611536026000977, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6958556175231934, + "num_tokens": 677698827.0, + "step": 26192 + }, + { + "epoch": 2.8764550845596313, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.199070453643799, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7108529806137085, + "num_tokens": 677729257.0, + "step": 26193 + }, + { + "epoch": 2.8765649022622446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.373349666595459, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7002387642860413, + "num_tokens": 677755695.0, + "step": 26194 + }, + { + "epoch": 2.8766747199648584, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5644378662109375, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7084211111068726, + "num_tokens": 677783088.0, + "step": 26195 + }, + { + "epoch": 2.876784537667472, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2305045127868652, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7130829095840454, + "num_tokens": 677814594.0, + "step": 26196 + }, + { + "epoch": 2.8768943553700854, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.587392807006836, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7359910607337952, + "num_tokens": 677836354.0, + "step": 26197 + }, + { + "epoch": 2.877004173072699, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2457926273345947, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7253167033195496, + "num_tokens": 677865061.0, + "step": 26198 + }, + { + 
"epoch": 2.877113990775313, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4204256534576416, + "learning_rate": 1e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.6876847743988037, + "num_tokens": 677893485.0, + "step": 26199 + }, + { + "epoch": 2.8772238084779267, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.443568706512451, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.709231972694397, + "num_tokens": 677917057.0, + "step": 26200 + }, + { + "epoch": 2.8773336261805404, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.195793628692627, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.72661292552948, + "num_tokens": 677946716.0, + "step": 26201 + }, + { + "epoch": 2.8774434438831538, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.43518328666687, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6932373046875, + "num_tokens": 677974654.0, + "step": 26202 + }, + { + "epoch": 2.8775532615857675, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.396901845932007, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7040988206863403, + "num_tokens": 678000009.0, + "step": 26203 + }, + { + "epoch": 2.8776630792883813, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2742209434509277, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7187631130218506, + "num_tokens": 678027105.0, + "step": 26204 + }, + { + "epoch": 2.877772896990995, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.140611410140991, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7137081623077393, + "num_tokens": 678057760.0, + "step": 26205 + }, + { + "epoch": 2.877882714693609, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3826069831848145, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7140610218048096, + "num_tokens": 678084386.0, + "step": 26206 + }, + { + "epoch": 2.877992532396222, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4572436809539795, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7066911458969116, + "num_tokens": 678110250.0, + "step": 26207 + }, + { + "epoch": 2.878102350098836, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.358098268508911, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6939526200294495, + "num_tokens": 678138519.0, + "step": 26208 + }, + { + "epoch": 2.8782121678014496, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.702604055404663, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7070114016532898, + "num_tokens": 678160459.0, + "step": 26209 + }, + { + "epoch": 2.8783219855040634, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.533489227294922, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7239048480987549, + "num_tokens": 678183192.0, + "step": 26210 + }, + { + "epoch": 2.878431803206677, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2781500816345215, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7164895534515381, + "num_tokens": 678212127.0, + "step": 26211 + }, + { + "epoch": 2.8785416209092904, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.306320905685425, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7192621827125549, + "num_tokens": 678240686.0, + "step": 26212 + }, + { + "epoch": 2.878651438611904, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.298877477645874, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7432432770729065, + "num_tokens": 678270121.0, + "step": 26213 + }, + { + "epoch": 2.878761256314518, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3533010482788086, + "learning_rate": 1e-06, + "loss": 1.0986, + "mean_token_accuracy": 0.6847710609436035, + "num_tokens": 678300259.0, + "step": 26214 + }, + { + "epoch": 2.8788710740171317, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.3941240310668945, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.7050580382347107, + "num_tokens": 678325595.0, + "step": 26215 + }, + { + "epoch": 2.8789808917197455, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.430553436279297, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7076526880264282, + "num_tokens": 678351866.0, + "step": 26216 + }, + { + "epoch": 2.879090709422359, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.300992965698242, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7032016515731812, + "num_tokens": 678380016.0, + "step": 26217 + }, + { + "epoch": 2.8792005271249725, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.432255506515503, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7263401746749878, + "num_tokens": 678406271.0, + "step": 26218 + }, + { + "epoch": 2.8793103448275863, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3360650539398193, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7225285172462463, + "num_tokens": 678433377.0, + "step": 26219 + }, + { + "epoch": 2.8794201625301996, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.430629253387451, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7209649085998535, + "num_tokens": 678457691.0, + "step": 26220 + }, + { + "epoch": 2.8795299802328134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.290677309036255, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6996321082115173, + "num_tokens": 678486304.0, + "step": 26221 + }, + { + "epoch": 2.879639797935427, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417175054550171, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7025797367095947, + "num_tokens": 678512215.0, + "step": 26222 + }, + { + "epoch": 2.879749615638041, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.3313963413238525, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7165170311927795, + "num_tokens": 678537965.0, + "step": 26223 + }, + { + "epoch": 2.8798594333406546, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2527527809143066, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7233047485351562, + "num_tokens": 678567305.0, + "step": 26224 + }, + { + "epoch": 2.879969251043268, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.678054094314575, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7193881273269653, + "num_tokens": 678588741.0, + "step": 26225 + }, + { + "epoch": 2.8800790687458817, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.464003801345825, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7052381038665771, + "num_tokens": 678614744.0, + "step": 26226 + }, + { + "epoch": 2.8801888864484955, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8654191493988037, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7337707281112671, + "num_tokens": 678633366.0, + "step": 26227 + }, + { + "epoch": 2.8802987041511092, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6310155391693115, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7254974842071533, + "num_tokens": 678655254.0, + "step": 26228 + }, + { + "epoch": 2.880408521853723, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4527854919433594, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7354499101638794, + "num_tokens": 678678121.0, + "step": 26229 + }, + { + "epoch": 2.8805183395563363, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.155794620513916, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7116012573242188, + "num_tokens": 678708706.0, + "step": 26230 + }, + { + "epoch": 2.88062815725895, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.2956597805023193, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.701396107673645, + "num_tokens": 678736911.0, + "step": 26231 + }, + { + "epoch": 2.880737974961564, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4808998107910156, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7119932174682617, + "num_tokens": 678760187.0, + "step": 26232 + }, + { + "epoch": 2.8808477926641776, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5134103298187256, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.719947338104248, + "num_tokens": 678783427.0, + "step": 26233 + }, + { + "epoch": 2.8809576103667913, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.601809024810791, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7204616069793701, + "num_tokens": 678806767.0, + "step": 26234 + }, + { + "epoch": 2.8810674280694046, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6379482746124268, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7092279195785522, + "num_tokens": 678831097.0, + "step": 26235 + }, + { + "epoch": 2.8811772457720184, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.443544864654541, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.702894926071167, + "num_tokens": 678859565.0, + "step": 26236 + }, + { + "epoch": 2.881287063474632, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.348508358001709, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7168049812316895, + "num_tokens": 678887636.0, + "step": 26237 + }, + { + "epoch": 2.881396881177246, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3610193729400635, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.714472770690918, + "num_tokens": 678915873.0, + "step": 26238 + }, + { + "epoch": 2.8815066988798597, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.365703821182251, + 
"learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7120919227600098, + "num_tokens": 678941675.0, + "step": 26239 + }, + { + "epoch": 2.881616516582473, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.554858446121216, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.721674382686615, + "num_tokens": 678965197.0, + "step": 26240 + }, + { + "epoch": 2.8817263342850867, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4684271812438965, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6912425756454468, + "num_tokens": 678990892.0, + "step": 26241 + }, + { + "epoch": 2.8818361519877005, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.547252655029297, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.72666996717453, + "num_tokens": 679012837.0, + "step": 26242 + }, + { + "epoch": 2.881945969690314, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1715240478515625, + "learning_rate": 1e-06, + "loss": 1.0826, + "mean_token_accuracy": 0.6904078125953674, + "num_tokens": 679045032.0, + "step": 26243 + }, + { + "epoch": 2.882055787392928, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9368128776550293, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7190759181976318, + "num_tokens": 679064203.0, + "step": 26244 + }, + { + "epoch": 2.8821656050955413, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.312899112701416, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7108606100082397, + "num_tokens": 679091818.0, + "step": 26245 + }, + { + "epoch": 2.882275422798155, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4174187183380127, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7428681254386902, + "num_tokens": 679117480.0, + "step": 26246 + }, + { + "epoch": 2.882385240500769, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2533581256866455, + "learning_rate": 1e-06, + "loss": 
0.988, + "mean_token_accuracy": 0.706282377243042, + "num_tokens": 679147118.0, + "step": 26247 + }, + { + "epoch": 2.882495058203382, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5136337280273438, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7043660879135132, + "num_tokens": 679171613.0, + "step": 26248 + }, + { + "epoch": 2.882604875905996, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.27367901802063, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7132014036178589, + "num_tokens": 679200715.0, + "step": 26249 + }, + { + "epoch": 2.8827146936086097, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.250734567642212, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7227588295936584, + "num_tokens": 679229449.0, + "step": 26250 + }, + { + "epoch": 2.8828245113112234, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3705124855041504, + "learning_rate": 1e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6920027732849121, + "num_tokens": 679255059.0, + "step": 26251 + }, + { + "epoch": 2.882934329013837, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7145767211914062, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7209871411323547, + "num_tokens": 679274261.0, + "step": 26252 + }, + { + "epoch": 2.8830441467164505, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1484618186950684, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7397800087928772, + "num_tokens": 679302410.0, + "step": 26253 + }, + { + "epoch": 2.8831539644190642, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3650431632995605, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7392421364784241, + "num_tokens": 679327577.0, + "step": 26254 + }, + { + "epoch": 2.883263782121678, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3927884101867676, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 
0.7270788550376892, + "num_tokens": 679353635.0, + "step": 26255 + }, + { + "epoch": 2.8833735998242918, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.469521999359131, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.7042884826660156, + "num_tokens": 679382008.0, + "step": 26256 + }, + { + "epoch": 2.8834834175269055, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3883395195007324, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.7030791640281677, + "num_tokens": 679410828.0, + "step": 26257 + }, + { + "epoch": 2.883593235229519, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.129263401031494, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7110422849655151, + "num_tokens": 679443533.0, + "step": 26258 + }, + { + "epoch": 2.8837030529321326, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8357865810394287, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7241523265838623, + "num_tokens": 679463114.0, + "step": 26259 + }, + { + "epoch": 2.8838128706347463, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2398135662078857, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7422777414321899, + "num_tokens": 679490246.0, + "step": 26260 + }, + { + "epoch": 2.88392268833736, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2734341621398926, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7163264155387878, + "num_tokens": 679517195.0, + "step": 26261 + }, + { + "epoch": 2.884032506039974, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.577141046524048, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7261533141136169, + "num_tokens": 679540984.0, + "step": 26262 + }, + { + "epoch": 2.884142323742587, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.61014723777771, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7042626142501831, + 
"num_tokens": 679564476.0, + "step": 26263 + }, + { + "epoch": 2.884252141445201, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.417152166366577, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7147986888885498, + "num_tokens": 679591224.0, + "step": 26264 + }, + { + "epoch": 2.8843619591478147, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3362648487091064, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7378726005554199, + "num_tokens": 679618867.0, + "step": 26265 + }, + { + "epoch": 2.8844717768504284, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.290707588195801, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7031945586204529, + "num_tokens": 679647701.0, + "step": 26266 + }, + { + "epoch": 2.884581594553042, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8762354850769043, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7347123026847839, + "num_tokens": 679666694.0, + "step": 26267 + }, + { + "epoch": 2.8846914122556555, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.470834732055664, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.728550910949707, + "num_tokens": 679690306.0, + "step": 26268 + }, + { + "epoch": 2.8848012299582693, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7309045791625977, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7104321718215942, + "num_tokens": 679710856.0, + "step": 26269 + }, + { + "epoch": 2.884911047660883, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2247800827026367, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6979010701179504, + "num_tokens": 679740890.0, + "step": 26270 + }, + { + "epoch": 2.8850208653634963, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.171994924545288, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7131079435348511, + "num_tokens": 679772541.0, + 
"step": 26271 + }, + { + "epoch": 2.88513068306611, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.713923931121826, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7249791622161865, + "num_tokens": 679792942.0, + "step": 26272 + }, + { + "epoch": 2.885240500768724, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.439488410949707, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.7014234662055969, + "num_tokens": 679821926.0, + "step": 26273 + }, + { + "epoch": 2.8853503184713376, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.390869140625, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7281397581100464, + "num_tokens": 679846469.0, + "step": 26274 + }, + { + "epoch": 2.8854601361739514, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.306972026824951, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6995013356208801, + "num_tokens": 679876950.0, + "step": 26275 + }, + { + "epoch": 2.8855699538765647, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.266599178314209, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6962900161743164, + "num_tokens": 679907480.0, + "step": 26276 + }, + { + "epoch": 2.8856797715791784, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4756131172180176, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7243750095367432, + "num_tokens": 679932416.0, + "step": 26277 + }, + { + "epoch": 2.885789589281792, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.462495803833008, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7131257057189941, + "num_tokens": 679956498.0, + "step": 26278 + }, + { + "epoch": 2.885899406984406, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2517752647399902, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7046138644218445, + "num_tokens": 679983183.0, + "step": 26279 + }, + { + "epoch": 
2.8860092246870197, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2204761505126953, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7395237684249878, + "num_tokens": 680013070.0, + "step": 26280 + }, + { + "epoch": 2.886119042389633, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6512155532836914, + "learning_rate": 1e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.7002552151679993, + "num_tokens": 680038849.0, + "step": 26281 + }, + { + "epoch": 2.8862288600922468, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.419215202331543, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7114608883857727, + "num_tokens": 680064328.0, + "step": 26282 + }, + { + "epoch": 2.8863386777948605, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.406822443008423, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6948586702346802, + "num_tokens": 680091408.0, + "step": 26283 + }, + { + "epoch": 2.8864484954974743, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1362884044647217, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7226380109786987, + "num_tokens": 680123767.0, + "step": 26284 + }, + { + "epoch": 2.886558313200088, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8671700954437256, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7060190439224243, + "num_tokens": 680143812.0, + "step": 26285 + }, + { + "epoch": 2.8866681309027014, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.419146776199341, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7382544875144958, + "num_tokens": 680169260.0, + "step": 26286 + }, + { + "epoch": 2.886777948605315, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.383289337158203, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.727404773235321, + "num_tokens": 680193558.0, + "step": 26287 + }, + { + "epoch": 2.886887766307929, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.388373374938965, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7075008153915405, + "num_tokens": 680220597.0, + "step": 26288 + }, + { + "epoch": 2.8869975840105426, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.788613796234131, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7326604127883911, + "num_tokens": 680240737.0, + "step": 26289 + }, + { + "epoch": 2.8871074017131564, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3815364837646484, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7251527905464172, + "num_tokens": 680266014.0, + "step": 26290 + }, + { + "epoch": 2.8872172194157697, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.502582550048828, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7185779213905334, + "num_tokens": 680290115.0, + "step": 26291 + }, + { + "epoch": 2.8873270371183835, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.658210515975952, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7077322006225586, + "num_tokens": 680318250.0, + "step": 26292 + }, + { + "epoch": 2.887436854820997, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5093741416931152, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7118765115737915, + "num_tokens": 680342706.0, + "step": 26293 + }, + { + "epoch": 2.887546672523611, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3436737060546875, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7002155184745789, + "num_tokens": 680373243.0, + "step": 26294 + }, + { + "epoch": 2.8876564902262247, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0608599185943604, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7124963402748108, + "num_tokens": 680404986.0, + "step": 26295 + }, + { + "epoch": 2.887766307928838, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.517941951751709, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7075952887535095, + "num_tokens": 680430597.0, + "step": 26296 + }, + { + "epoch": 2.887876125631452, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.324582576751709, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7323851585388184, + "num_tokens": 680458734.0, + "step": 26297 + }, + { + "epoch": 2.8879859433340656, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.261742353439331, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6916534304618835, + "num_tokens": 680490034.0, + "step": 26298 + }, + { + "epoch": 2.888095761036679, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.385889768600464, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7293145656585693, + "num_tokens": 680515547.0, + "step": 26299 + }, + { + "epoch": 2.8882055787392926, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.532940626144409, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7170494794845581, + "num_tokens": 680538587.0, + "step": 26300 + }, + { + "epoch": 2.8883153964419064, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.393580436706543, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7083384990692139, + "num_tokens": 680565636.0, + "step": 26301 + }, + { + "epoch": 2.88842521414452, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.045468330383301, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7230166792869568, + "num_tokens": 680589704.0, + "step": 26302 + }, + { + "epoch": 2.888535031847134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7345850467681885, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7066179513931274, + "num_tokens": 680611431.0, + "step": 26303 + }, + { + "epoch": 2.888644849549747, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.631032705307007, + 
"learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7049195170402527, + "num_tokens": 680634011.0, + "step": 26304 + }, + { + "epoch": 2.888754667252361, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.42688250541687, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7339422702789307, + "num_tokens": 680657458.0, + "step": 26305 + }, + { + "epoch": 2.8888644849549747, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3703460693359375, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7428128123283386, + "num_tokens": 680683301.0, + "step": 26306 + }, + { + "epoch": 2.8889743026575885, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4020583629608154, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.697693943977356, + "num_tokens": 680709705.0, + "step": 26307 + }, + { + "epoch": 2.8890841203602022, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3715627193450928, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7063419818878174, + "num_tokens": 680736867.0, + "step": 26308 + }, + { + "epoch": 2.8891939380628155, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.506549119949341, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7151550054550171, + "num_tokens": 680761365.0, + "step": 26309 + }, + { + "epoch": 2.8893037557654293, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.425283670425415, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6995458602905273, + "num_tokens": 680785840.0, + "step": 26310 + }, + { + "epoch": 2.889413573468043, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5405373573303223, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7159124612808228, + "num_tokens": 680811875.0, + "step": 26311 + }, + { + "epoch": 2.889523391170657, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.51078462600708, + "learning_rate": 1e-06, + "loss": 
0.8531, + "mean_token_accuracy": 0.7496214509010315, + "num_tokens": 680834686.0, + "step": 26312 + }, + { + "epoch": 2.8896332088732706, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.324455499649048, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.704479992389679, + "num_tokens": 680863684.0, + "step": 26313 + }, + { + "epoch": 2.889743026575884, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.526857852935791, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7190842628479004, + "num_tokens": 680886106.0, + "step": 26314 + }, + { + "epoch": 2.8898528442784976, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2788031101226807, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7086936235427856, + "num_tokens": 680914405.0, + "step": 26315 + }, + { + "epoch": 2.8899626619811114, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.296140432357788, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7133817672729492, + "num_tokens": 680943760.0, + "step": 26316 + }, + { + "epoch": 2.890072479683725, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.315613031387329, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7266404032707214, + "num_tokens": 680971377.0, + "step": 26317 + }, + { + "epoch": 2.890182297386339, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2227699756622314, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7076652646064758, + "num_tokens": 681002931.0, + "step": 26318 + }, + { + "epoch": 2.8902921150889522, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.33960223197937, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7118030786514282, + "num_tokens": 681030516.0, + "step": 26319 + }, + { + "epoch": 2.890401932791566, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2730872631073, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 
0.6884763836860657, + "num_tokens": 681059702.0, + "step": 26320 + }, + { + "epoch": 2.8905117504941797, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3197755813598633, + "learning_rate": 1e-06, + "loss": 1.1222, + "mean_token_accuracy": 0.6710212826728821, + "num_tokens": 681092970.0, + "step": 26321 + }, + { + "epoch": 2.890621568196793, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5622899532318115, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7320674657821655, + "num_tokens": 681116302.0, + "step": 26322 + }, + { + "epoch": 2.8907313858994073, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.418092727661133, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7082114815711975, + "num_tokens": 681140960.0, + "step": 26323 + }, + { + "epoch": 2.8908412036020206, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.443096876144409, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7094470858573914, + "num_tokens": 681166078.0, + "step": 26324 + }, + { + "epoch": 2.8909510213046343, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.499861001968384, + "learning_rate": 1e-06, + "loss": 1.085, + "mean_token_accuracy": 0.6935474872589111, + "num_tokens": 681194378.0, + "step": 26325 + }, + { + "epoch": 2.891060839007248, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.414175271987915, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7100123763084412, + "num_tokens": 681220387.0, + "step": 26326 + }, + { + "epoch": 2.8911706567098614, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4804675579071045, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7084242105484009, + "num_tokens": 681246376.0, + "step": 26327 + }, + { + "epoch": 2.891280474412475, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2072582244873047, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6983324885368347, + 
"num_tokens": 681277320.0, + "step": 26328 + }, + { + "epoch": 2.891390292115089, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.428006410598755, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7289225459098816, + "num_tokens": 681301160.0, + "step": 26329 + }, + { + "epoch": 2.8915001098177027, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6708412170410156, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7248382568359375, + "num_tokens": 681322226.0, + "step": 26330 + }, + { + "epoch": 2.8916099275203164, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5692999362945557, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7251189351081848, + "num_tokens": 681345895.0, + "step": 26331 + }, + { + "epoch": 2.8917197452229297, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.627866744995117, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7263625264167786, + "num_tokens": 681369058.0, + "step": 26332 + }, + { + "epoch": 2.8918295629255435, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6437010765075684, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.7051026821136475, + "num_tokens": 681393988.0, + "step": 26333 + }, + { + "epoch": 2.8919393806281573, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4290945529937744, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.728050172328949, + "num_tokens": 681419271.0, + "step": 26334 + }, + { + "epoch": 2.892049198330771, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.548978090286255, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6990230083465576, + "num_tokens": 681445181.0, + "step": 26335 + }, + { + "epoch": 2.8921590160333848, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3806400299072266, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.699349582195282, + "num_tokens": 681478406.0, + 
"step": 26336 + }, + { + "epoch": 2.892268833735998, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.554800510406494, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.710116982460022, + "num_tokens": 681502338.0, + "step": 26337 + }, + { + "epoch": 2.892378651438612, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.500873327255249, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6931387782096863, + "num_tokens": 681531407.0, + "step": 26338 + }, + { + "epoch": 2.8924884691412256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.156379222869873, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.696966826915741, + "num_tokens": 681563462.0, + "step": 26339 + }, + { + "epoch": 2.8925982868438394, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6730308532714844, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7316217422485352, + "num_tokens": 681584214.0, + "step": 26340 + }, + { + "epoch": 2.892708104546453, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3049421310424805, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7267605066299438, + "num_tokens": 681613791.0, + "step": 26341 + }, + { + "epoch": 2.8928179222490664, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.465524435043335, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7033526301383972, + "num_tokens": 681646841.0, + "step": 26342 + }, + { + "epoch": 2.89292773995168, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6527297496795654, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6972283720970154, + "num_tokens": 681672239.0, + "step": 26343 + }, + { + "epoch": 2.893037557654294, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.738189220428467, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7075374722480774, + "num_tokens": 681692934.0, + "step": 26344 + }, + { + "epoch": 
2.8931473753569077, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4720287322998047, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7338461875915527, + "num_tokens": 681716216.0, + "step": 26345 + }, + { + "epoch": 2.8932571930595214, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2701165676116943, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7099781036376953, + "num_tokens": 681745947.0, + "step": 26346 + }, + { + "epoch": 2.8933670107621348, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.746633291244507, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7278097867965698, + "num_tokens": 681764836.0, + "step": 26347 + }, + { + "epoch": 2.8934768284647485, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.517397165298462, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.7049781084060669, + "num_tokens": 681790826.0, + "step": 26348 + }, + { + "epoch": 2.8935866461673623, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3112306594848633, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7223793864250183, + "num_tokens": 681818482.0, + "step": 26349 + }, + { + "epoch": 2.8936964638699756, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5251214504241943, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6974109411239624, + "num_tokens": 681843663.0, + "step": 26350 + }, + { + "epoch": 2.8938062815725893, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4123497009277344, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7155702114105225, + "num_tokens": 681870011.0, + "step": 26351 + }, + { + "epoch": 2.893916099275203, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.567431688308716, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7278076410293579, + "num_tokens": 681894347.0, + "step": 26352 + }, + { + "epoch": 2.894025916977817, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2962236404418945, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7268347144126892, + "num_tokens": 681921147.0, + "step": 26353 + }, + { + "epoch": 2.8941357346804306, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3417675495147705, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7264554500579834, + "num_tokens": 681946221.0, + "step": 26354 + }, + { + "epoch": 2.894245552383044, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2323966026306152, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6850042939186096, + "num_tokens": 681977249.0, + "step": 26355 + }, + { + "epoch": 2.8943553700856577, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3896889686584473, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7106888294219971, + "num_tokens": 682004527.0, + "step": 26356 + }, + { + "epoch": 2.8944651877882714, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4506423473358154, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7366390228271484, + "num_tokens": 682028877.0, + "step": 26357 + }, + { + "epoch": 2.894575005490885, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3437132835388184, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7246674299240112, + "num_tokens": 682055359.0, + "step": 26358 + }, + { + "epoch": 2.894684823193499, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.66896390914917, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7221797704696655, + "num_tokens": 682077733.0, + "step": 26359 + }, + { + "epoch": 2.8947946408961123, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.63148832321167, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7064322233200073, + "num_tokens": 682099772.0, + "step": 26360 + }, + { + "epoch": 2.894904458598726, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.5041427612304688, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7241237759590149, + "num_tokens": 682127088.0, + "step": 26361 + }, + { + "epoch": 2.89501427630134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6830317974090576, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7188663482666016, + "num_tokens": 682149027.0, + "step": 26362 + }, + { + "epoch": 2.8951240940039535, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5351808071136475, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7069872617721558, + "num_tokens": 682172011.0, + "step": 26363 + }, + { + "epoch": 2.8952339117065673, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4097814559936523, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6991479396820068, + "num_tokens": 682199704.0, + "step": 26364 + }, + { + "epoch": 2.8953437294091806, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.655989408493042, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7537174820899963, + "num_tokens": 682219918.0, + "step": 26365 + }, + { + "epoch": 2.8954535471117944, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7763659954071045, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7274388074874878, + "num_tokens": 682241287.0, + "step": 26366 + }, + { + "epoch": 2.895563364814408, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.456338405609131, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7103580236434937, + "num_tokens": 682266925.0, + "step": 26367 + }, + { + "epoch": 2.895673182517022, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8441057205200195, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7269515991210938, + "num_tokens": 682287503.0, + "step": 26368 + }, + { + "epoch": 2.8957830002196356, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.4473302364349365, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7389333248138428, + "num_tokens": 682311142.0, + "step": 26369 + }, + { + "epoch": 2.895892817922249, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0722951889038086, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7001672983169556, + "num_tokens": 682346342.0, + "step": 26370 + }, + { + "epoch": 2.8960026356248627, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4815969467163086, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7104537487030029, + "num_tokens": 682370628.0, + "step": 26371 + }, + { + "epoch": 2.8961124533274765, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7604217529296875, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7162066698074341, + "num_tokens": 682393547.0, + "step": 26372 + }, + { + "epoch": 2.89622227103009, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3247733116149902, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7244927883148193, + "num_tokens": 682421741.0, + "step": 26373 + }, + { + "epoch": 2.896332088732704, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.322678565979004, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7115911841392517, + "num_tokens": 682449528.0, + "step": 26374 + }, + { + "epoch": 2.8964419064353173, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.514406681060791, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.7007472515106201, + "num_tokens": 682473028.0, + "step": 26375 + }, + { + "epoch": 2.896551724137931, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4288673400878906, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7071855664253235, + "num_tokens": 682500082.0, + "step": 26376 + }, + { + "epoch": 2.896661541840545, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.25390887260437, 
+ "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.7019073963165283, + "num_tokens": 682530425.0, + "step": 26377 + }, + { + "epoch": 2.896771359543158, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5515520572662354, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7085587978363037, + "num_tokens": 682554091.0, + "step": 26378 + }, + { + "epoch": 2.896881177245772, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3889267444610596, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7097318172454834, + "num_tokens": 682579492.0, + "step": 26379 + }, + { + "epoch": 2.8969909949483856, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.497076988220215, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7350863218307495, + "num_tokens": 682603660.0, + "step": 26380 + }, + { + "epoch": 2.8971008126509994, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.426358938217163, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.7048178911209106, + "num_tokens": 682631859.0, + "step": 26381 + }, + { + "epoch": 2.897210630353613, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.543675184249878, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7280918955802917, + "num_tokens": 682656306.0, + "step": 26382 + }, + { + "epoch": 2.8973204480562265, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.569035053253174, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7190127372741699, + "num_tokens": 682679307.0, + "step": 26383 + }, + { + "epoch": 2.89743026575884, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.460176706314087, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7203361392021179, + "num_tokens": 682704005.0, + "step": 26384 + }, + { + "epoch": 2.897540083461454, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2815401554107666, + "learning_rate": 1e-06, + 
"loss": 0.9155, + "mean_token_accuracy": 0.725193202495575, + "num_tokens": 682730369.0, + "step": 26385 + }, + { + "epoch": 2.8976499011640677, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.209880828857422, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.7047949433326721, + "num_tokens": 682761701.0, + "step": 26386 + }, + { + "epoch": 2.8977597188666815, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.305760383605957, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7263826131820679, + "num_tokens": 682787398.0, + "step": 26387 + }, + { + "epoch": 2.897869536569295, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.439610481262207, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.705133855342865, + "num_tokens": 682813490.0, + "step": 26388 + }, + { + "epoch": 2.8979793542719086, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.464411735534668, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7259847521781921, + "num_tokens": 682839551.0, + "step": 26389 + }, + { + "epoch": 2.8980891719745223, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4781227111816406, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.705971360206604, + "num_tokens": 682864892.0, + "step": 26390 + }, + { + "epoch": 2.898198989677136, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.430166721343994, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6952844262123108, + "num_tokens": 682892896.0, + "step": 26391 + }, + { + "epoch": 2.89830880737975, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.619598627090454, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7064342498779297, + "num_tokens": 682915798.0, + "step": 26392 + }, + { + "epoch": 2.898418625082363, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.354604721069336, + "learning_rate": 1e-06, + "loss": 0.9706, + 
"mean_token_accuracy": 0.7132140398025513, + "num_tokens": 682940538.0, + "step": 26393 + }, + { + "epoch": 2.898528442784977, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2507879734039307, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6882206797599792, + "num_tokens": 682972109.0, + "step": 26394 + }, + { + "epoch": 2.8986382604875907, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5205185413360596, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7081117033958435, + "num_tokens": 682998786.0, + "step": 26395 + }, + { + "epoch": 2.8987480781902044, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4952597618103027, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6929323673248291, + "num_tokens": 683025291.0, + "step": 26396 + }, + { + "epoch": 2.898857895892818, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1760144233703613, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7185927629470825, + "num_tokens": 683056370.0, + "step": 26397 + }, + { + "epoch": 2.8989677135954315, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.377425193786621, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.713409960269928, + "num_tokens": 683084465.0, + "step": 26398 + }, + { + "epoch": 2.8990775312980452, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.544379949569702, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7159899473190308, + "num_tokens": 683108337.0, + "step": 26399 + }, + { + "epoch": 2.899187349000659, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.380021333694458, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7089120149612427, + "num_tokens": 683134858.0, + "step": 26400 + }, + { + "epoch": 2.8992971667032723, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.597682237625122, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 
0.7144341468811035, + "num_tokens": 683158821.0, + "step": 26401 + }, + { + "epoch": 2.899406984405886, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.428309440612793, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7036537528038025, + "num_tokens": 683185864.0, + "step": 26402 + }, + { + "epoch": 2.8995168021085, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3553497791290283, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7059303522109985, + "num_tokens": 683211009.0, + "step": 26403 + }, + { + "epoch": 2.8996266198111136, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.291501045227051, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7276583909988403, + "num_tokens": 683240253.0, + "step": 26404 + }, + { + "epoch": 2.8997364375137273, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6729519367218018, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7046404480934143, + "num_tokens": 683264149.0, + "step": 26405 + }, + { + "epoch": 2.8998462552163407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4717447757720947, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7017459869384766, + "num_tokens": 683292588.0, + "step": 26406 + }, + { + "epoch": 2.8999560729189544, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.526033401489258, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7290347218513489, + "num_tokens": 683314964.0, + "step": 26407 + }, + { + "epoch": 2.900065890621568, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.31416392326355, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7532682418823242, + "num_tokens": 683340361.0, + "step": 26408 + }, + { + "epoch": 2.900175708324182, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.0166664123535156, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7190359830856323, + "num_tokens": 
683362288.0, + "step": 26409 + }, + { + "epoch": 2.9002855260267957, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5176124572753906, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.711134672164917, + "num_tokens": 683387158.0, + "step": 26410 + }, + { + "epoch": 2.900395343729409, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8642995357513428, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7038230895996094, + "num_tokens": 683408163.0, + "step": 26411 + }, + { + "epoch": 2.9005051614320227, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5996522903442383, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7079393863677979, + "num_tokens": 683434116.0, + "step": 26412 + }, + { + "epoch": 2.9006149791346365, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.261928081512451, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7137340307235718, + "num_tokens": 683463104.0, + "step": 26413 + }, + { + "epoch": 2.9007247968372503, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3881025314331055, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7101921439170837, + "num_tokens": 683488989.0, + "step": 26414 + }, + { + "epoch": 2.900834614539864, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.239301919937134, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7081120014190674, + "num_tokens": 683520293.0, + "step": 26415 + }, + { + "epoch": 2.9009444322424773, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7067129611968994, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7376343011856079, + "num_tokens": 683542115.0, + "step": 26416 + }, + { + "epoch": 2.901054249945091, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.263059616088867, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7138481140136719, + "num_tokens": 683572409.0, + "step": 26417 + 
}, + { + "epoch": 2.901164067647705, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6144700050354004, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7168674468994141, + "num_tokens": 683597498.0, + "step": 26418 + }, + { + "epoch": 2.9012738853503186, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0995044708251953, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7261152267456055, + "num_tokens": 683632653.0, + "step": 26419 + }, + { + "epoch": 2.9013837030529324, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.286349058151245, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.715252161026001, + "num_tokens": 683662090.0, + "step": 26420 + }, + { + "epoch": 2.9014935207555457, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6444506645202637, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7184473276138306, + "num_tokens": 683684165.0, + "step": 26421 + }, + { + "epoch": 2.9016033384581594, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.475632905960083, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6991487741470337, + "num_tokens": 683710140.0, + "step": 26422 + }, + { + "epoch": 2.901713156160773, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.350857973098755, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6965575218200684, + "num_tokens": 683738357.0, + "step": 26423 + }, + { + "epoch": 2.9018229738633865, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.261989116668701, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.736977219581604, + "num_tokens": 683765150.0, + "step": 26424 + }, + { + "epoch": 2.9019327915660007, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.48687744140625, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6909307837486267, + "num_tokens": 683791127.0, + "step": 26425 + }, + { + "epoch": 
2.902042609268614, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.470672607421875, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7207618355751038, + "num_tokens": 683814724.0, + "step": 26426 + }, + { + "epoch": 2.9021524269712278, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6896097660064697, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7453773617744446, + "num_tokens": 683834075.0, + "step": 26427 + }, + { + "epoch": 2.9022622446738415, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.199097156524658, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.727471113204956, + "num_tokens": 683865794.0, + "step": 26428 + }, + { + "epoch": 2.902372062376455, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5791068077087402, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7204946279525757, + "num_tokens": 683891188.0, + "step": 26429 + }, + { + "epoch": 2.9024818800790686, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0856852531433105, + "learning_rate": 1e-06, + "loss": 1.1129, + "mean_token_accuracy": 0.6823587417602539, + "num_tokens": 683926150.0, + "step": 26430 + }, + { + "epoch": 2.9025916977816824, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6758241653442383, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7433890104293823, + "num_tokens": 683946763.0, + "step": 26431 + }, + { + "epoch": 2.902701515484296, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.410780668258667, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7184222936630249, + "num_tokens": 683972539.0, + "step": 26432 + }, + { + "epoch": 2.90281133318691, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.648575782775879, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7279126644134521, + "num_tokens": 683995041.0, + "step": 26433 + }, + { + "epoch": 2.902921150889523, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.2051994800567627, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6971051692962646, + "num_tokens": 684028733.0, + "step": 26434 + }, + { + "epoch": 2.903030968592137, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4001524448394775, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7079945206642151, + "num_tokens": 684056832.0, + "step": 26435 + }, + { + "epoch": 2.9031407862947507, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2585484981536865, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7411713004112244, + "num_tokens": 684085015.0, + "step": 26436 + }, + { + "epoch": 2.9032506039973645, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5685365200042725, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7328842282295227, + "num_tokens": 684109006.0, + "step": 26437 + }, + { + "epoch": 2.903360421699978, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5592823028564453, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.730943500995636, + "num_tokens": 684132450.0, + "step": 26438 + }, + { + "epoch": 2.9034702394025915, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.156251907348633, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7121390700340271, + "num_tokens": 684164024.0, + "step": 26439 + }, + { + "epoch": 2.9035800571052053, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2582573890686035, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.7039825916290283, + "num_tokens": 684193418.0, + "step": 26440 + }, + { + "epoch": 2.903689874807819, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.604856252670288, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7088838815689087, + "num_tokens": 684219420.0, + "step": 26441 + }, + { + "epoch": 2.903799692510433, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.4161314964294434, + "learning_rate": 1e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7666292190551758, + "num_tokens": 684241687.0, + "step": 26442 + }, + { + "epoch": 2.9039095102130466, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.291400909423828, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7199919819831848, + "num_tokens": 684270672.0, + "step": 26443 + }, + { + "epoch": 2.90401932791566, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4901742935180664, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.7061287760734558, + "num_tokens": 684296671.0, + "step": 26444 + }, + { + "epoch": 2.9041291456182736, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.376871347427368, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7097876071929932, + "num_tokens": 684322906.0, + "step": 26445 + }, + { + "epoch": 2.9042389633208874, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3874776363372803, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.7076581120491028, + "num_tokens": 684352381.0, + "step": 26446 + }, + { + "epoch": 2.904348781023501, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5340588092803955, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7164338231086731, + "num_tokens": 684376296.0, + "step": 26447 + }, + { + "epoch": 2.904458598726115, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5026214122772217, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7153784036636353, + "num_tokens": 684400919.0, + "step": 26448 + }, + { + "epoch": 2.904568416428728, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5805091857910156, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.706581711769104, + "num_tokens": 684428314.0, + "step": 26449 + }, + { + "epoch": 2.904678234131342, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.601057529449463, 
+ "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.7010560035705566, + "num_tokens": 684452540.0, + "step": 26450 + }, + { + "epoch": 2.9047880518339557, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3844947814941406, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7068037986755371, + "num_tokens": 684477807.0, + "step": 26451 + }, + { + "epoch": 2.904897869536569, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8020195960998535, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7409464120864868, + "num_tokens": 684496994.0, + "step": 26452 + }, + { + "epoch": 2.905007687239183, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2151052951812744, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7025373578071594, + "num_tokens": 684529066.0, + "step": 26453 + }, + { + "epoch": 2.9051175049417965, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5598301887512207, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.733588695526123, + "num_tokens": 684551104.0, + "step": 26454 + }, + { + "epoch": 2.9052273226444103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5547735691070557, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.715833306312561, + "num_tokens": 684577229.0, + "step": 26455 + }, + { + "epoch": 2.905337140347024, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3518741130828857, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7226918935775757, + "num_tokens": 684605572.0, + "step": 26456 + }, + { + "epoch": 2.9054469580496374, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1773204803466797, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7198293209075928, + "num_tokens": 684636023.0, + "step": 26457 + }, + { + "epoch": 2.905556775752251, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3997461795806885, + "learning_rate": 1e-06, + 
"loss": 1.028, + "mean_token_accuracy": 0.7077219486236572, + "num_tokens": 684663762.0, + "step": 26458 + }, + { + "epoch": 2.905666593454865, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5921666622161865, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7283729314804077, + "num_tokens": 684685399.0, + "step": 26459 + }, + { + "epoch": 2.9057764111574786, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7269015312194824, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7395646572113037, + "num_tokens": 684706935.0, + "step": 26460 + }, + { + "epoch": 2.9058862288600924, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.371877908706665, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7212749123573303, + "num_tokens": 684735676.0, + "step": 26461 + }, + { + "epoch": 2.9059960465627057, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2327489852905273, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7087295055389404, + "num_tokens": 684767002.0, + "step": 26462 + }, + { + "epoch": 2.9061058642653195, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4533767700195312, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7043110728263855, + "num_tokens": 684794623.0, + "step": 26463 + }, + { + "epoch": 2.9062156819679332, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4282498359680176, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7139812111854553, + "num_tokens": 684820573.0, + "step": 26464 + }, + { + "epoch": 2.906325499670547, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2025692462921143, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7126870155334473, + "num_tokens": 684848905.0, + "step": 26465 + }, + { + "epoch": 2.9064353173731607, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4668636322021484, + "learning_rate": 1e-06, + "loss": 0.859, + 
"mean_token_accuracy": 0.7396034002304077, + "num_tokens": 684872597.0, + "step": 26466 + }, + { + "epoch": 2.906545135075774, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4826815128326416, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6874964237213135, + "num_tokens": 684899741.0, + "step": 26467 + }, + { + "epoch": 2.906654952778388, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.151705741882324, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.725479006767273, + "num_tokens": 684922939.0, + "step": 26468 + }, + { + "epoch": 2.9067647704810016, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4831631183624268, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7124691605567932, + "num_tokens": 684948105.0, + "step": 26469 + }, + { + "epoch": 2.9068745881836153, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.215313673019409, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6999871730804443, + "num_tokens": 684979234.0, + "step": 26470 + }, + { + "epoch": 2.906984405886229, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7064290046691895, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7247695326805115, + "num_tokens": 685001535.0, + "step": 26471 + }, + { + "epoch": 2.9070942235888424, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.576202630996704, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7184692025184631, + "num_tokens": 685024754.0, + "step": 26472 + }, + { + "epoch": 2.907204041291456, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.274188995361328, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7182132005691528, + "num_tokens": 685054084.0, + "step": 26473 + }, + { + "epoch": 2.90731385899407, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2273974418640137, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 
0.7036072611808777, + "num_tokens": 685084931.0, + "step": 26474 + }, + { + "epoch": 2.9074236766966837, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4445135593414307, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.7029416561126709, + "num_tokens": 685112701.0, + "step": 26475 + }, + { + "epoch": 2.9075334943992974, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4095046520233154, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7098134160041809, + "num_tokens": 685139446.0, + "step": 26476 + }, + { + "epoch": 2.9076433121019107, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7748448848724365, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7336546182632446, + "num_tokens": 685160733.0, + "step": 26477 + }, + { + "epoch": 2.9077531298045245, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.177269220352173, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7076863050460815, + "num_tokens": 685189134.0, + "step": 26478 + }, + { + "epoch": 2.9078629475071383, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5881330966949463, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7406705021858215, + "num_tokens": 685211228.0, + "step": 26479 + }, + { + "epoch": 2.9079727652097516, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.495762348175049, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7099835276603699, + "num_tokens": 685236995.0, + "step": 26480 + }, + { + "epoch": 2.9080825829123653, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.792591094970703, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7099910974502563, + "num_tokens": 685262502.0, + "step": 26481 + }, + { + "epoch": 2.908192400614979, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.684568405151367, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7226265668869019, + 
"num_tokens": 685283923.0, + "step": 26482 + }, + { + "epoch": 2.908302218317593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3541653156280518, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7164243459701538, + "num_tokens": 685311142.0, + "step": 26483 + }, + { + "epoch": 2.9084120360202066, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6148123741149902, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7281811237335205, + "num_tokens": 685334840.0, + "step": 26484 + }, + { + "epoch": 2.90852185372282, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.396944522857666, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7094936370849609, + "num_tokens": 685362203.0, + "step": 26485 + }, + { + "epoch": 2.9086316714254337, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5586283206939697, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7077791690826416, + "num_tokens": 685385678.0, + "step": 26486 + }, + { + "epoch": 2.9087414891280474, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3174870014190674, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.7018780708312988, + "num_tokens": 685416591.0, + "step": 26487 + }, + { + "epoch": 2.908851306830661, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.665562152862549, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.7116767168045044, + "num_tokens": 685439488.0, + "step": 26488 + }, + { + "epoch": 2.908961124533275, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3086984157562256, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7053318023681641, + "num_tokens": 685467009.0, + "step": 26489 + }, + { + "epoch": 2.9090709422358882, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.442030429840088, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.719057023525238, + "num_tokens": 685492831.0, + 
"step": 26490 + }, + { + "epoch": 2.909180759938502, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4284796714782715, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.7053099870681763, + "num_tokens": 685520269.0, + "step": 26491 + }, + { + "epoch": 2.9092905776411158, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.762834310531616, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7220917344093323, + "num_tokens": 685539928.0, + "step": 26492 + }, + { + "epoch": 2.9094003953437295, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6957414150238037, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.697738528251648, + "num_tokens": 685564126.0, + "step": 26493 + }, + { + "epoch": 2.9095102130463433, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.34283447265625, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7145596742630005, + "num_tokens": 685590910.0, + "step": 26494 + }, + { + "epoch": 2.9096200307489566, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.441551923751831, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7046462297439575, + "num_tokens": 685620084.0, + "step": 26495 + }, + { + "epoch": 2.9097298484515703, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4759421348571777, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7341928482055664, + "num_tokens": 685647562.0, + "step": 26496 + }, + { + "epoch": 2.909839666154184, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2938787937164307, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7060378193855286, + "num_tokens": 685675349.0, + "step": 26497 + }, + { + "epoch": 2.909949483856798, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4789278507232666, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7282845973968506, + "num_tokens": 685698449.0, + "step": 26498 + }, + { + 
"epoch": 2.9100593015594116, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3474934101104736, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6903374791145325, + "num_tokens": 685727170.0, + "step": 26499 + }, + { + "epoch": 2.910169119262025, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1702189445495605, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.7000364661216736, + "num_tokens": 685759830.0, + "step": 26500 + }, + { + "epoch": 2.9102789369646387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2054831981658936, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7171705961227417, + "num_tokens": 685790878.0, + "step": 26501 + }, + { + "epoch": 2.9103887546672524, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.166618585586548, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7209786772727966, + "num_tokens": 685822032.0, + "step": 26502 + }, + { + "epoch": 2.9104985723698658, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.413327217102051, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7092512845993042, + "num_tokens": 685849320.0, + "step": 26503 + }, + { + "epoch": 2.91060839007248, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.543619155883789, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6995823383331299, + "num_tokens": 685873771.0, + "step": 26504 + }, + { + "epoch": 2.9107182077750933, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1997873783111572, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6952983736991882, + "num_tokens": 685904227.0, + "step": 26505 + }, + { + "epoch": 2.910828025477707, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.446349620819092, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7211477160453796, + "num_tokens": 685931888.0, + "step": 26506 + }, + { + "epoch": 2.910937843180321, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5194122791290283, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7181205153465271, + "num_tokens": 685955824.0, + "step": 26507 + }, + { + "epoch": 2.911047660882934, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.491549491882324, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7115843296051025, + "num_tokens": 685980405.0, + "step": 26508 + }, + { + "epoch": 2.911157478585548, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.714984655380249, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7167676687240601, + "num_tokens": 686000283.0, + "step": 26509 + }, + { + "epoch": 2.9112672962881616, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4668326377868652, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7293492555618286, + "num_tokens": 686024486.0, + "step": 26510 + }, + { + "epoch": 2.9113771139907754, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.267563819885254, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7427866458892822, + "num_tokens": 686049863.0, + "step": 26511 + }, + { + "epoch": 2.911486931693389, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.449657440185547, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7246869206428528, + "num_tokens": 686074858.0, + "step": 26512 + }, + { + "epoch": 2.9115967493960024, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6306233406066895, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7105039358139038, + "num_tokens": 686098909.0, + "step": 26513 + }, + { + "epoch": 2.911706567098616, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4522671699523926, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7077678442001343, + "num_tokens": 686122999.0, + "step": 26514 + }, + { + "epoch": 2.91181638480123, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.6046833992004395, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7240456342697144, + "num_tokens": 686146006.0, + "step": 26515 + }, + { + "epoch": 2.9119262025038437, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 4.285897254943848, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.703298032283783, + "num_tokens": 686179381.0, + "step": 26516 + }, + { + "epoch": 2.9120360202064575, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.346794605255127, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7098957896232605, + "num_tokens": 686207753.0, + "step": 26517 + }, + { + "epoch": 2.912145837909071, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.448852777481079, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7055450677871704, + "num_tokens": 686233578.0, + "step": 26518 + }, + { + "epoch": 2.9122556556116845, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.539217710494995, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7260493040084839, + "num_tokens": 686258649.0, + "step": 26519 + }, + { + "epoch": 2.9123654733142983, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3546035289764404, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7578414678573608, + "num_tokens": 686282824.0, + "step": 26520 + }, + { + "epoch": 2.912475291016912, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1072487831115723, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.703113317489624, + "num_tokens": 686315799.0, + "step": 26521 + }, + { + "epoch": 2.912585108719526, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4914803504943848, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7274532318115234, + "num_tokens": 686339348.0, + "step": 26522 + }, + { + "epoch": 2.912694926422139, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.777143716812134, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7404204607009888, + "num_tokens": 686360029.0, + "step": 26523 + }, + { + "epoch": 2.912804744124753, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.405289888381958, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6950063109397888, + "num_tokens": 686387739.0, + "step": 26524 + }, + { + "epoch": 2.9129145618273666, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5824222564697266, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7206220626831055, + "num_tokens": 686410635.0, + "step": 26525 + }, + { + "epoch": 2.9130243795299804, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4326460361480713, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7262132167816162, + "num_tokens": 686436426.0, + "step": 26526 + }, + { + "epoch": 2.913134197232594, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1574342250823975, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7010085582733154, + "num_tokens": 686466815.0, + "step": 26527 + }, + { + "epoch": 2.9132440149352075, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5473473072052, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6994379162788391, + "num_tokens": 686491711.0, + "step": 26528 + }, + { + "epoch": 2.913353832637821, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.513180732727051, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7016764879226685, + "num_tokens": 686515784.0, + "step": 26529 + }, + { + "epoch": 2.913463650340435, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 2.732844591140747, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7087070345878601, + "num_tokens": 686538227.0, + "step": 26530 + }, + { + "epoch": 2.9135734680430483, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.301586627960205, + 
"learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6927748322486877, + "num_tokens": 686568077.0, + "step": 26531 + }, + { + "epoch": 2.913683285745662, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.261218309402466, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.711397647857666, + "num_tokens": 686597018.0, + "step": 26532 + }, + { + "epoch": 2.913793103448276, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6301848888397217, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7187566161155701, + "num_tokens": 686618219.0, + "step": 26533 + }, + { + "epoch": 2.9139029211508896, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4789116382598877, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7161303758621216, + "num_tokens": 686642533.0, + "step": 26534 + }, + { + "epoch": 2.9140127388535033, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6534271240234375, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6966972351074219, + "num_tokens": 686665114.0, + "step": 26535 + }, + { + "epoch": 2.9141225565561166, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.335787296295166, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6930113434791565, + "num_tokens": 686695160.0, + "step": 26536 + }, + { + "epoch": 2.9142323742587304, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6156485080718994, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6990206241607666, + "num_tokens": 686717975.0, + "step": 26537 + }, + { + "epoch": 2.914342191961344, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4816513061523438, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7176163196563721, + "num_tokens": 686744794.0, + "step": 26538 + }, + { + "epoch": 2.914452009663958, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3748910427093506, + "learning_rate": 1e-06, + 
"loss": 1.0325, + "mean_token_accuracy": 0.6936997771263123, + "num_tokens": 686770704.0, + "step": 26539 + }, + { + "epoch": 2.9145618273665717, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2015068531036377, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7235800623893738, + "num_tokens": 686800460.0, + "step": 26540 + }, + { + "epoch": 2.914671645069185, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.199260950088501, + "learning_rate": 1e-06, + "loss": 1.1122, + "mean_token_accuracy": 0.67544025182724, + "num_tokens": 686835550.0, + "step": 26541 + }, + { + "epoch": 2.9147814627717987, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.325981616973877, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7174409627914429, + "num_tokens": 686860874.0, + "step": 26542 + }, + { + "epoch": 2.9148912804744125, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3388874530792236, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7039773464202881, + "num_tokens": 686889331.0, + "step": 26543 + }, + { + "epoch": 2.9150010981770262, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.342247247695923, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7085444927215576, + "num_tokens": 686919202.0, + "step": 26544 + }, + { + "epoch": 2.91511091587964, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.503941535949707, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7082322239875793, + "num_tokens": 686942904.0, + "step": 26545 + }, + { + "epoch": 2.9152207335822533, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2547404766082764, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7175620794296265, + "num_tokens": 686973126.0, + "step": 26546 + }, + { + "epoch": 2.915330551284867, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.499786853790283, + "learning_rate": 1e-06, + "loss": 0.9566, + 
"mean_token_accuracy": 0.7161934971809387, + "num_tokens": 686998311.0, + "step": 26547 + }, + { + "epoch": 2.915440368987481, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.31982159614563, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7077294588088989, + "num_tokens": 687025114.0, + "step": 26548 + }, + { + "epoch": 2.9155501866900946, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.598137378692627, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7447623610496521, + "num_tokens": 687050089.0, + "step": 26549 + }, + { + "epoch": 2.9156600043927083, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.328073501586914, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7008870840072632, + "num_tokens": 687079672.0, + "step": 26550 + }, + { + "epoch": 2.9157698220953217, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4714434146881104, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7250463962554932, + "num_tokens": 687104830.0, + "step": 26551 + }, + { + "epoch": 2.9158796397979354, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7349016666412354, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7342703342437744, + "num_tokens": 687124592.0, + "step": 26552 + }, + { + "epoch": 2.915989457500549, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.418295383453369, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7324987053871155, + "num_tokens": 687149660.0, + "step": 26553 + }, + { + "epoch": 2.9160992752031625, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5604844093322754, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6965821981430054, + "num_tokens": 687173940.0, + "step": 26554 + }, + { + "epoch": 2.9162090929057767, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.24873685836792, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 
0.7095133066177368, + "num_tokens": 687206727.0, + "step": 26555 + }, + { + "epoch": 2.91631891060839, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6060431003570557, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7248731851577759, + "num_tokens": 687227811.0, + "step": 26556 + }, + { + "epoch": 2.9164287283110037, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2492096424102783, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7144370675086975, + "num_tokens": 687256730.0, + "step": 26557 + }, + { + "epoch": 2.9165385460136175, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.296031951904297, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.732507050037384, + "num_tokens": 687281628.0, + "step": 26558 + }, + { + "epoch": 2.916648363716231, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1719868183135986, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7242248058319092, + "num_tokens": 687310682.0, + "step": 26559 + }, + { + "epoch": 2.9167581814188446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2020480632781982, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6984866857528687, + "num_tokens": 687340045.0, + "step": 26560 + }, + { + "epoch": 2.9168679991214583, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3633949756622314, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.717998743057251, + "num_tokens": 687365898.0, + "step": 26561 + }, + { + "epoch": 2.916977816824072, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3214738368988037, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7428712844848633, + "num_tokens": 687390425.0, + "step": 26562 + }, + { + "epoch": 2.917087634526686, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.677485466003418, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7202915549278259, + 
"num_tokens": 687412248.0, + "step": 26563 + }, + { + "epoch": 2.917197452229299, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 6.997679233551025, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.711263120174408, + "num_tokens": 687438042.0, + "step": 26564 + }, + { + "epoch": 2.917307269931913, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3939125537872314, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.699401319026947, + "num_tokens": 687464699.0, + "step": 26565 + }, + { + "epoch": 2.9174170876345267, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4947547912597656, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7188282608985901, + "num_tokens": 687490154.0, + "step": 26566 + }, + { + "epoch": 2.9175269053371404, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6548993587493896, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7341022491455078, + "num_tokens": 687510151.0, + "step": 26567 + }, + { + "epoch": 2.917636723039754, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4388298988342285, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.7070578336715698, + "num_tokens": 687534651.0, + "step": 26568 + }, + { + "epoch": 2.9177465407423675, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5639238357543945, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.702478289604187, + "num_tokens": 687557368.0, + "step": 26569 + }, + { + "epoch": 2.9178563584449813, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.435884714126587, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7170984148979187, + "num_tokens": 687581983.0, + "step": 26570 + }, + { + "epoch": 2.917966176147595, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2853384017944336, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7018321752548218, + "num_tokens": 687610019.0, + 
"step": 26571 + }, + { + "epoch": 2.9180759938502088, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.147838592529297, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6929628849029541, + "num_tokens": 687644128.0, + "step": 26572 + }, + { + "epoch": 2.9181858115528225, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.850844383239746, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6926338076591492, + "num_tokens": 687664735.0, + "step": 26573 + }, + { + "epoch": 2.918295629255436, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3775429725646973, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.746290385723114, + "num_tokens": 687687794.0, + "step": 26574 + }, + { + "epoch": 2.9184054469580496, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5100982189178467, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7068392038345337, + "num_tokens": 687711311.0, + "step": 26575 + }, + { + "epoch": 2.9185152646606634, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6822080612182617, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7232897281646729, + "num_tokens": 687733156.0, + "step": 26576 + }, + { + "epoch": 2.918625082363277, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6008236408233643, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7209247350692749, + "num_tokens": 687755265.0, + "step": 26577 + }, + { + "epoch": 2.918734900065891, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.669003486633301, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7198525667190552, + "num_tokens": 687780821.0, + "step": 26578 + }, + { + "epoch": 2.918844717768504, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.642115592956543, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7287461757659912, + "num_tokens": 687803688.0, + "step": 26579 + }, + { + "epoch": 
2.918954535471118, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.423447847366333, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7283563613891602, + "num_tokens": 687829995.0, + "step": 26580 + }, + { + "epoch": 2.9190643531737317, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.248704671859741, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7078889608383179, + "num_tokens": 687859682.0, + "step": 26581 + }, + { + "epoch": 2.919174170876345, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1357226371765137, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6963402032852173, + "num_tokens": 687893976.0, + "step": 26582 + }, + { + "epoch": 2.9192839885789588, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2750942707061768, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7338880896568298, + "num_tokens": 687922005.0, + "step": 26583 + }, + { + "epoch": 2.9193938062815725, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.491010904312134, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7207949161529541, + "num_tokens": 687946333.0, + "step": 26584 + }, + { + "epoch": 2.9195036239841863, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6526620388031006, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.74202960729599, + "num_tokens": 687966792.0, + "step": 26585 + }, + { + "epoch": 2.9196134416868, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.381012439727783, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7196937203407288, + "num_tokens": 687993284.0, + "step": 26586 + }, + { + "epoch": 2.9197232593894134, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5517067909240723, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7159786820411682, + "num_tokens": 688016354.0, + "step": 26587 + }, + { + "epoch": 2.919833077092027, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.351289749145508, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7291457056999207, + "num_tokens": 688042962.0, + "step": 26588 + }, + { + "epoch": 2.919942894794641, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.699413537979126, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7073841691017151, + "num_tokens": 688065254.0, + "step": 26589 + }, + { + "epoch": 2.9200527124972546, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2200400829315186, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6858209371566772, + "num_tokens": 688094167.0, + "step": 26590 + }, + { + "epoch": 2.9201625301998684, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4668831825256348, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7205822467803955, + "num_tokens": 688121059.0, + "step": 26591 + }, + { + "epoch": 2.9202723479024817, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.392505168914795, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7113263010978699, + "num_tokens": 688148228.0, + "step": 26592 + }, + { + "epoch": 2.9203821656050954, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.731940746307373, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7261418104171753, + "num_tokens": 688168188.0, + "step": 26593 + }, + { + "epoch": 2.920491983307709, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.466212272644043, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.713133692741394, + "num_tokens": 688193376.0, + "step": 26594 + }, + { + "epoch": 2.920601801010323, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.270785331726074, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7069189548492432, + "num_tokens": 688223449.0, + "step": 26595 + }, + { + "epoch": 2.9207116187129367, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.521052837371826, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7205023765563965, + "num_tokens": 688248470.0, + "step": 26596 + }, + { + "epoch": 2.92082143641555, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.275355339050293, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7385280728340149, + "num_tokens": 688275671.0, + "step": 26597 + }, + { + "epoch": 2.920931254118164, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.305936098098755, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7155400514602661, + "num_tokens": 688305394.0, + "step": 26598 + }, + { + "epoch": 2.9210410718207775, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6745376586914062, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7114280462265015, + "num_tokens": 688327261.0, + "step": 26599 + }, + { + "epoch": 2.9211508895233913, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.412060022354126, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7324771881103516, + "num_tokens": 688354440.0, + "step": 26600 + }, + { + "epoch": 2.921260707226005, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7343873977661133, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7161152362823486, + "num_tokens": 688376412.0, + "step": 26601 + }, + { + "epoch": 2.9213705249286184, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.257139205932617, + "learning_rate": 1e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6870543956756592, + "num_tokens": 688408083.0, + "step": 26602 + }, + { + "epoch": 2.921480342631232, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.551882266998291, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7359178066253662, + "num_tokens": 688430832.0, + "step": 26603 + }, + { + "epoch": 2.921590160333846, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4849321842193604, + 
"learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7109906077384949, + "num_tokens": 688455208.0, + "step": 26604 + }, + { + "epoch": 2.921699978036459, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.43502140045166, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7192113995552063, + "num_tokens": 688480867.0, + "step": 26605 + }, + { + "epoch": 2.9218097957390734, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3602960109710693, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.7015198469161987, + "num_tokens": 688508797.0, + "step": 26606 + }, + { + "epoch": 2.9219196134416867, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3030130863189697, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6907480955123901, + "num_tokens": 688539193.0, + "step": 26607 + }, + { + "epoch": 2.9220294311443005, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2315895557403564, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7093563079833984, + "num_tokens": 688568712.0, + "step": 26608 + }, + { + "epoch": 2.9221392488469142, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.695385456085205, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7119929194450378, + "num_tokens": 688592866.0, + "step": 26609 + }, + { + "epoch": 2.9222490665495275, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.547471523284912, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7206155061721802, + "num_tokens": 688617035.0, + "step": 26610 + }, + { + "epoch": 2.9223588842521413, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3394839763641357, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7241236567497253, + "num_tokens": 688642534.0, + "step": 26611 + }, + { + "epoch": 2.922468701954755, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5221848487854004, + "learning_rate": 1e-06, + 
"loss": 0.9842, + "mean_token_accuracy": 0.7127611637115479, + "num_tokens": 688666128.0, + "step": 26612 + }, + { + "epoch": 2.922578519657369, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5979650020599365, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7426393628120422, + "num_tokens": 688688505.0, + "step": 26613 + }, + { + "epoch": 2.9226883373599826, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1961910724639893, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7010621428489685, + "num_tokens": 688719595.0, + "step": 26614 + }, + { + "epoch": 2.922798155062596, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7210938930511475, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6977921724319458, + "num_tokens": 688741262.0, + "step": 26615 + }, + { + "epoch": 2.9229079727652096, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.553623676300049, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.715930700302124, + "num_tokens": 688764920.0, + "step": 26616 + }, + { + "epoch": 2.9230177904678234, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.303032636642456, + "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6906802654266357, + "num_tokens": 688796330.0, + "step": 26617 + }, + { + "epoch": 2.923127608170437, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6584739685058594, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.739517331123352, + "num_tokens": 688817560.0, + "step": 26618 + }, + { + "epoch": 2.923237425873051, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.50042986869812, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7225821018218994, + "num_tokens": 688840841.0, + "step": 26619 + }, + { + "epoch": 2.9233472435756642, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.650085926055908, + "learning_rate": 1e-06, + "loss": 0.9535, + 
"mean_token_accuracy": 0.7180846333503723, + "num_tokens": 688863396.0, + "step": 26620 + }, + { + "epoch": 2.923457061278278, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.757884979248047, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7332848310470581, + "num_tokens": 688884373.0, + "step": 26621 + }, + { + "epoch": 2.9235668789808917, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6011343002319336, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7442620992660522, + "num_tokens": 688906250.0, + "step": 26622 + }, + { + "epoch": 2.9236766966835055, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6297833919525146, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7212876081466675, + "num_tokens": 688927619.0, + "step": 26623 + }, + { + "epoch": 2.9237865143861193, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3110671043395996, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.735079288482666, + "num_tokens": 688955302.0, + "step": 26624 + }, + { + "epoch": 2.9238963320887326, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4243407249450684, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6999601721763611, + "num_tokens": 688985863.0, + "step": 26625 + }, + { + "epoch": 2.9240061497913463, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3998777866363525, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.727983832359314, + "num_tokens": 689013871.0, + "step": 26626 + }, + { + "epoch": 2.92411596749396, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6474223136901855, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7153334021568298, + "num_tokens": 689036331.0, + "step": 26627 + }, + { + "epoch": 2.924225785196574, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.338432788848877, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 
0.7144113779067993, + "num_tokens": 689064330.0, + "step": 26628 + }, + { + "epoch": 2.9243356028991876, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5906550884246826, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.704047441482544, + "num_tokens": 689089193.0, + "step": 26629 + }, + { + "epoch": 2.924445420601801, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.347362756729126, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7115175127983093, + "num_tokens": 689117563.0, + "step": 26630 + }, + { + "epoch": 2.9245552383044147, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.584176540374756, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7204524278640747, + "num_tokens": 689141597.0, + "step": 26631 + }, + { + "epoch": 2.9246650560070284, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.372739791870117, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7310408353805542, + "num_tokens": 689167430.0, + "step": 26632 + }, + { + "epoch": 2.9247748737096417, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1379551887512207, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7292723655700684, + "num_tokens": 689199668.0, + "step": 26633 + }, + { + "epoch": 2.924884691412256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2735533714294434, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7140286564826965, + "num_tokens": 689232059.0, + "step": 26634 + }, + { + "epoch": 2.9249945091148692, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.408684015274048, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6977396011352539, + "num_tokens": 689259070.0, + "step": 26635 + }, + { + "epoch": 2.925104326817483, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.445204257965088, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6897963285446167, + 
"num_tokens": 689284076.0, + "step": 26636 + }, + { + "epoch": 2.9252141445200968, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5645101070404053, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7036439180374146, + "num_tokens": 689308431.0, + "step": 26637 + }, + { + "epoch": 2.92532396222271, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.162668228149414, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.7154457569122314, + "num_tokens": 689340239.0, + "step": 26638 + }, + { + "epoch": 2.925433779925324, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2818074226379395, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7191702127456665, + "num_tokens": 689369272.0, + "step": 26639 + }, + { + "epoch": 2.9255435976279376, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.213026523590088, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6875935792922974, + "num_tokens": 689400323.0, + "step": 26640 + }, + { + "epoch": 2.9256534153305513, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7289350032806396, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7197908759117126, + "num_tokens": 689422422.0, + "step": 26641 + }, + { + "epoch": 2.925763233033165, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.360456943511963, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6922896504402161, + "num_tokens": 689451044.0, + "step": 26642 + }, + { + "epoch": 2.9258730507357784, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6053946018218994, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7303153872489929, + "num_tokens": 689473795.0, + "step": 26643 + }, + { + "epoch": 2.925982868438392, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.303314447402954, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7072513699531555, + "num_tokens": 689503380.0, + 
"step": 26644 + }, + { + "epoch": 2.926092686141006, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6645376682281494, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7098557949066162, + "num_tokens": 689526467.0, + "step": 26645 + }, + { + "epoch": 2.9262025038436197, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.360239267349243, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6996021866798401, + "num_tokens": 689555347.0, + "step": 26646 + }, + { + "epoch": 2.9263123215462334, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1026432514190674, + "learning_rate": 1e-06, + "loss": 1.1045, + "mean_token_accuracy": 0.6793244481086731, + "num_tokens": 689592022.0, + "step": 26647 + }, + { + "epoch": 2.9264221392488468, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.621617555618286, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7158808708190918, + "num_tokens": 689612055.0, + "step": 26648 + }, + { + "epoch": 2.9265319569514605, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4255950450897217, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6988046169281006, + "num_tokens": 689638227.0, + "step": 26649 + }, + { + "epoch": 2.9266417746540743, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.257457971572876, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7162579298019409, + "num_tokens": 689669711.0, + "step": 26650 + }, + { + "epoch": 2.926751592356688, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3682589530944824, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7001947164535522, + "num_tokens": 689699272.0, + "step": 26651 + }, + { + "epoch": 2.926861410059302, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3119897842407227, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6985782384872437, + "num_tokens": 689729717.0, + "step": 26652 + }, + { + 
"epoch": 2.926971227761915, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3996472358703613, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7106714844703674, + "num_tokens": 689755552.0, + "step": 26653 + }, + { + "epoch": 2.927081045464529, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.630725860595703, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6860221028327942, + "num_tokens": 689780275.0, + "step": 26654 + }, + { + "epoch": 2.9271908631671426, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.394364595413208, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7104515433311462, + "num_tokens": 689806057.0, + "step": 26655 + }, + { + "epoch": 2.9273006808697564, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4769225120544434, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7191891074180603, + "num_tokens": 689831959.0, + "step": 26656 + }, + { + "epoch": 2.92741049857237, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2269647121429443, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7192800045013428, + "num_tokens": 689863640.0, + "step": 26657 + }, + { + "epoch": 2.9275203162749834, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3935067653656006, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7064952850341797, + "num_tokens": 689892249.0, + "step": 26658 + }, + { + "epoch": 2.927630133977597, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4833199977874756, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7173551917076111, + "num_tokens": 689916482.0, + "step": 26659 + }, + { + "epoch": 2.927739951680211, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.463759422302246, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7253506183624268, + "num_tokens": 689941566.0, + "step": 26660 + }, + { + "epoch": 2.9278497693828243, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3059816360473633, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7178685665130615, + "num_tokens": 689968884.0, + "step": 26661 + }, + { + "epoch": 2.927959587085438, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3754332065582275, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7027420997619629, + "num_tokens": 689997689.0, + "step": 26662 + }, + { + "epoch": 2.9280694047880518, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.592139959335327, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7264988422393799, + "num_tokens": 690020953.0, + "step": 26663 + }, + { + "epoch": 2.9281792224906655, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3569729328155518, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7055285573005676, + "num_tokens": 690049019.0, + "step": 26664 + }, + { + "epoch": 2.9282890401932793, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4476020336151123, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7214802503585815, + "num_tokens": 690075642.0, + "step": 26665 + }, + { + "epoch": 2.9283988578958926, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.186229944229126, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7113747000694275, + "num_tokens": 690107687.0, + "step": 26666 + }, + { + "epoch": 2.9285086755985064, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8982913494110107, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7106152772903442, + "num_tokens": 690128154.0, + "step": 26667 + }, + { + "epoch": 2.92861849330112, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.610250949859619, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7448939681053162, + "num_tokens": 690149424.0, + "step": 26668 + }, + { + "epoch": 2.928728311003734, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.4267847537994385, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7245931625366211, + "num_tokens": 690174474.0, + "step": 26669 + }, + { + "epoch": 2.9288381287063476, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.598111391067505, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7198242545127869, + "num_tokens": 690198756.0, + "step": 26670 + }, + { + "epoch": 2.928947946408961, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7122411727905273, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7479479908943176, + "num_tokens": 690219581.0, + "step": 26671 + }, + { + "epoch": 2.9290577641115747, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3259899616241455, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.7019232511520386, + "num_tokens": 690249638.0, + "step": 26672 + }, + { + "epoch": 2.9291675818141885, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3977627754211426, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6976464986801147, + "num_tokens": 690275987.0, + "step": 26673 + }, + { + "epoch": 2.929277399516802, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.78279709815979, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7397993206977844, + "num_tokens": 690295474.0, + "step": 26674 + }, + { + "epoch": 2.929387217219416, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7833383083343506, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7226848602294922, + "num_tokens": 690315944.0, + "step": 26675 + }, + { + "epoch": 2.9294970349220293, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5540668964385986, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7212351560592651, + "num_tokens": 690340715.0, + "step": 26676 + }, + { + "epoch": 2.929606852624643, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.632192373275757, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7187631130218506, + "num_tokens": 690361929.0, + "step": 26677 + }, + { + "epoch": 2.929716670327257, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.258971691131592, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.7010249495506287, + "num_tokens": 690391351.0, + "step": 26678 + }, + { + "epoch": 2.9298264880298706, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.232072591781616, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7040754556655884, + "num_tokens": 690421413.0, + "step": 26679 + }, + { + "epoch": 2.9299363057324843, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5987813472747803, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6914371252059937, + "num_tokens": 690446312.0, + "step": 26680 + }, + { + "epoch": 2.9300461234350976, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5640292167663574, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6877204775810242, + "num_tokens": 690471830.0, + "step": 26681 + }, + { + "epoch": 2.9301559411377114, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.426285743713379, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6971577405929565, + "num_tokens": 690501404.0, + "step": 26682 + }, + { + "epoch": 2.930265758840325, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5609631538391113, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.713215172290802, + "num_tokens": 690524742.0, + "step": 26683 + }, + { + "epoch": 2.9303755765429385, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6795246601104736, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7542656064033508, + "num_tokens": 690546333.0, + "step": 26684 + }, + { + "epoch": 2.9304853942455527, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.5286524295806885, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7315779328346252, + "num_tokens": 690569697.0, + "step": 26685 + }, + { + "epoch": 2.930595211948166, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5302398204803467, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7568361759185791, + "num_tokens": 690593988.0, + "step": 26686 + }, + { + "epoch": 2.9307050296507797, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4968841075897217, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7060812711715698, + "num_tokens": 690618230.0, + "step": 26687 + }, + { + "epoch": 2.9308148473533935, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6578149795532227, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7239333391189575, + "num_tokens": 690639264.0, + "step": 26688 + }, + { + "epoch": 2.930924665056007, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.624171733856201, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7361478805541992, + "num_tokens": 690661980.0, + "step": 26689 + }, + { + "epoch": 2.9310344827586206, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.621157646179199, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7212479710578918, + "num_tokens": 690685725.0, + "step": 26690 + }, + { + "epoch": 2.9311443004612343, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5504226684570312, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7122597098350525, + "num_tokens": 690710122.0, + "step": 26691 + }, + { + "epoch": 2.931254118163848, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.530647039413452, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7168018817901611, + "num_tokens": 690737360.0, + "step": 26692 + }, + { + "epoch": 2.931363935866462, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6960771083831787, + 
"learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7116565704345703, + "num_tokens": 690758151.0, + "step": 26693 + }, + { + "epoch": 2.931473753569075, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4342823028564453, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7456235885620117, + "num_tokens": 690782626.0, + "step": 26694 + }, + { + "epoch": 2.931583571271689, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.689532995223999, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7305845022201538, + "num_tokens": 690804268.0, + "step": 26695 + }, + { + "epoch": 2.9316933889743026, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3263700008392334, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7085844278335571, + "num_tokens": 690833758.0, + "step": 26696 + }, + { + "epoch": 2.9318032066769164, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.18721866607666, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6904253959655762, + "num_tokens": 690867639.0, + "step": 26697 + }, + { + "epoch": 2.93191302437953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.129981756210327, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6975020170211792, + "num_tokens": 690902197.0, + "step": 26698 + }, + { + "epoch": 2.9320228420821435, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2640082836151123, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.696586549282074, + "num_tokens": 690932496.0, + "step": 26699 + }, + { + "epoch": 2.9321326597847572, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6538238525390625, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7196489572525024, + "num_tokens": 690956064.0, + "step": 26700 + }, + { + "epoch": 2.932242477487371, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.430863618850708, + "learning_rate": 1e-06, + "loss": 
0.9937, + "mean_token_accuracy": 0.7047992944717407, + "num_tokens": 690980200.0, + "step": 26701 + }, + { + "epoch": 2.9323522951899847, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.52717661857605, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7184160947799683, + "num_tokens": 691003199.0, + "step": 26702 + }, + { + "epoch": 2.9324621128925985, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.601203680038452, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7157967686653137, + "num_tokens": 691025235.0, + "step": 26703 + }, + { + "epoch": 2.932571930595212, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4228596687316895, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7411923408508301, + "num_tokens": 691049426.0, + "step": 26704 + }, + { + "epoch": 2.9326817482978256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3049237728118896, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.713512122631073, + "num_tokens": 691080489.0, + "step": 26705 + }, + { + "epoch": 2.9327915660004393, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2999441623687744, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.7068071365356445, + "num_tokens": 691109836.0, + "step": 26706 + }, + { + "epoch": 2.932901383703053, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5113327503204346, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7171177864074707, + "num_tokens": 691133883.0, + "step": 26707 + }, + { + "epoch": 2.933011201405667, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4164624214172363, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7141573429107666, + "num_tokens": 691159481.0, + "step": 26708 + }, + { + "epoch": 2.93312101910828, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.99994158744812, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 
0.7207962274551392, + "num_tokens": 691178012.0, + "step": 26709 + }, + { + "epoch": 2.933230836810894, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6707417964935303, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7127081155776978, + "num_tokens": 691200489.0, + "step": 26710 + }, + { + "epoch": 2.9333406545135077, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8444621562957764, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7287474870681763, + "num_tokens": 691220284.0, + "step": 26711 + }, + { + "epoch": 2.933450472216121, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3116986751556396, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7406913042068481, + "num_tokens": 691247468.0, + "step": 26712 + }, + { + "epoch": 2.9335602899187347, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5480213165283203, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7107570171356201, + "num_tokens": 691274301.0, + "step": 26713 + }, + { + "epoch": 2.9336701076213485, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.813450813293457, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7260692119598389, + "num_tokens": 691294714.0, + "step": 26714 + }, + { + "epoch": 2.9337799253239623, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.400603771209717, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6971567273139954, + "num_tokens": 691320042.0, + "step": 26715 + }, + { + "epoch": 2.933889743026576, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.00570011138916, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7376430034637451, + "num_tokens": 691336829.0, + "step": 26716 + }, + { + "epoch": 2.9339995607291893, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1671268939971924, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7343329191207886, + 
"num_tokens": 691366283.0, + "step": 26717 + }, + { + "epoch": 2.934109378431803, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.559455156326294, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7283240556716919, + "num_tokens": 691390445.0, + "step": 26718 + }, + { + "epoch": 2.934219196134417, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.441774845123291, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.722732424736023, + "num_tokens": 691416398.0, + "step": 26719 + }, + { + "epoch": 2.9343290138370306, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.078221559524536, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6947444677352905, + "num_tokens": 691451529.0, + "step": 26720 + }, + { + "epoch": 2.9344388315396444, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4380807876586914, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7224298715591431, + "num_tokens": 691476310.0, + "step": 26721 + }, + { + "epoch": 2.9345486492422577, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5963730812072754, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7301778793334961, + "num_tokens": 691499514.0, + "step": 26722 + }, + { + "epoch": 2.9346584669448714, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6180853843688965, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7104334235191345, + "num_tokens": 691521563.0, + "step": 26723 + }, + { + "epoch": 2.934768284647485, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4860775470733643, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7099918127059937, + "num_tokens": 691545425.0, + "step": 26724 + }, + { + "epoch": 2.934878102350099, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3845303058624268, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6883803606033325, + "num_tokens": 691571772.0, + 
"step": 26725 + }, + { + "epoch": 2.9349879200527127, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5382845401763916, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6988331079483032, + "num_tokens": 691597828.0, + "step": 26726 + }, + { + "epoch": 2.935097737755326, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3642261028289795, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6879757046699524, + "num_tokens": 691627954.0, + "step": 26727 + }, + { + "epoch": 2.9352075554579398, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.570234775543213, + "learning_rate": 1e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6855820417404175, + "num_tokens": 691655678.0, + "step": 26728 + }, + { + "epoch": 2.9353173731605535, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7362589836120605, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.7107153534889221, + "num_tokens": 691681956.0, + "step": 26729 + }, + { + "epoch": 2.9354271908631673, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.403569459915161, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7182781100273132, + "num_tokens": 691708417.0, + "step": 26730 + }, + { + "epoch": 2.935537008565781, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.773439645767212, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7107024192810059, + "num_tokens": 691728792.0, + "step": 26731 + }, + { + "epoch": 2.9356468262683943, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.564608097076416, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7275683879852295, + "num_tokens": 691751611.0, + "step": 26732 + }, + { + "epoch": 2.935756643971008, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7166967391967773, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7107419371604919, + "num_tokens": 691773539.0, + "step": 26733 + }, + { + 
"epoch": 2.935866461673622, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.58988881111145, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7199355363845825, + "num_tokens": 691796543.0, + "step": 26734 + }, + { + "epoch": 2.935976279376235, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2926318645477295, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7222589254379272, + "num_tokens": 691824616.0, + "step": 26735 + }, + { + "epoch": 2.9360860970788494, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.55047869682312, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6988431215286255, + "num_tokens": 691847901.0, + "step": 26736 + }, + { + "epoch": 2.9361959147814627, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.27101731300354, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7276822328567505, + "num_tokens": 691875459.0, + "step": 26737 + }, + { + "epoch": 2.9363057324840764, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3137102127075195, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7055731415748596, + "num_tokens": 691905817.0, + "step": 26738 + }, + { + "epoch": 2.93641555018669, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.337451457977295, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7256539463996887, + "num_tokens": 691932060.0, + "step": 26739 + }, + { + "epoch": 2.9365253678893035, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.322500705718994, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6994098424911499, + "num_tokens": 691959423.0, + "step": 26740 + }, + { + "epoch": 2.9366351855919173, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2665295600891113, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.718267560005188, + "num_tokens": 691989589.0, + "step": 26741 + }, + { + "epoch": 2.936745003294531, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4758965969085693, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7170294523239136, + "num_tokens": 692018265.0, + "step": 26742 + }, + { + "epoch": 2.936854820997145, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4236671924591064, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.715606689453125, + "num_tokens": 692044060.0, + "step": 26743 + }, + { + "epoch": 2.9369646386997585, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3114559650421143, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7185398936271667, + "num_tokens": 692071309.0, + "step": 26744 + }, + { + "epoch": 2.937074456402372, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5207629203796387, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.734094500541687, + "num_tokens": 692095403.0, + "step": 26745 + }, + { + "epoch": 2.9371842741049856, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4726784229278564, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7056361436843872, + "num_tokens": 692120477.0, + "step": 26746 + }, + { + "epoch": 2.9372940918075994, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.370715856552124, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7076910734176636, + "num_tokens": 692147261.0, + "step": 26747 + }, + { + "epoch": 2.937403909510213, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.546422004699707, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7009601593017578, + "num_tokens": 692170175.0, + "step": 26748 + }, + { + "epoch": 2.937513727212827, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0285587310791016, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.7008153200149536, + "num_tokens": 692207159.0, + "step": 26749 + }, + { + "epoch": 2.93762354491544, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.6861751079559326, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7304788827896118, + "num_tokens": 692228771.0, + "step": 26750 + }, + { + "epoch": 2.937733362618054, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5642998218536377, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7080644369125366, + "num_tokens": 692252062.0, + "step": 26751 + }, + { + "epoch": 2.9378431803206677, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.633002281188965, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7236617803573608, + "num_tokens": 692276146.0, + "step": 26752 + }, + { + "epoch": 2.9379529980232815, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4151229858398438, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.719973087310791, + "num_tokens": 692302206.0, + "step": 26753 + }, + { + "epoch": 2.9380628157258952, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.550830364227295, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7191303968429565, + "num_tokens": 692325652.0, + "step": 26754 + }, + { + "epoch": 2.9381726334285085, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2431020736694336, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7036956548690796, + "num_tokens": 692355280.0, + "step": 26755 + }, + { + "epoch": 2.9382824511311223, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4647085666656494, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7097227573394775, + "num_tokens": 692379783.0, + "step": 26756 + }, + { + "epoch": 2.938392268833736, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.539818048477173, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.717143177986145, + "num_tokens": 692402919.0, + "step": 26757 + }, + { + "epoch": 2.93850208653635, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.357008695602417, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7221725583076477, + "num_tokens": 692429074.0, + "step": 26758 + }, + { + "epoch": 2.9386119042389636, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.633439064025879, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7159063816070557, + "num_tokens": 692452534.0, + "step": 26759 + }, + { + "epoch": 2.938721721941577, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.574958086013794, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7216089963912964, + "num_tokens": 692477274.0, + "step": 26760 + }, + { + "epoch": 2.9388315396441906, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6717941761016846, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7228602170944214, + "num_tokens": 692500243.0, + "step": 26761 + }, + { + "epoch": 2.9389413573468044, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.562791347503662, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7110103964805603, + "num_tokens": 692523281.0, + "step": 26762 + }, + { + "epoch": 2.9390511750494177, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3282155990600586, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7191264629364014, + "num_tokens": 692550813.0, + "step": 26763 + }, + { + "epoch": 2.9391609927520315, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8893022537231445, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7195835709571838, + "num_tokens": 692571744.0, + "step": 26764 + }, + { + "epoch": 2.939270810454645, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.752070426940918, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7179261445999146, + "num_tokens": 692594231.0, + "step": 26765 + }, + { + "epoch": 2.939380628157259, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8014862537384033, 
+ "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7234030961990356, + "num_tokens": 692614966.0, + "step": 26766 + }, + { + "epoch": 2.9394904458598727, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6573307514190674, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7167553305625916, + "num_tokens": 692638923.0, + "step": 26767 + }, + { + "epoch": 2.939600263562486, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4435744285583496, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.7029529809951782, + "num_tokens": 692664177.0, + "step": 26768 + }, + { + "epoch": 2.9397100812651, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5389063358306885, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7222850918769836, + "num_tokens": 692686427.0, + "step": 26769 + }, + { + "epoch": 2.9398198989677136, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4117372035980225, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7079893946647644, + "num_tokens": 692714060.0, + "step": 26770 + }, + { + "epoch": 2.9399297166703273, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7692525386810303, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7556310296058655, + "num_tokens": 692735925.0, + "step": 26771 + }, + { + "epoch": 2.940039534372941, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.884935140609741, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7145353555679321, + "num_tokens": 692755877.0, + "step": 26772 + }, + { + "epoch": 2.9401493520755544, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.595452070236206, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6954964399337769, + "num_tokens": 692779742.0, + "step": 26773 + }, + { + "epoch": 2.940259169778168, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.345616102218628, + "learning_rate": 1e-06, + "loss": 
1.0251, + "mean_token_accuracy": 0.7039124965667725, + "num_tokens": 692807891.0, + "step": 26774 + }, + { + "epoch": 2.940368987480782, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8026375770568848, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7164936661720276, + "num_tokens": 692827979.0, + "step": 26775 + }, + { + "epoch": 2.9404788051833957, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4568727016448975, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6857925057411194, + "num_tokens": 692857900.0, + "step": 26776 + }, + { + "epoch": 2.9405886228860094, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.462332010269165, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6979736089706421, + "num_tokens": 692887200.0, + "step": 26777 + }, + { + "epoch": 2.9406984405886227, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2361326217651367, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7210628986358643, + "num_tokens": 692916475.0, + "step": 26778 + }, + { + "epoch": 2.9408082582912365, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4865293502807617, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.707547128200531, + "num_tokens": 692941391.0, + "step": 26779 + }, + { + "epoch": 2.9409180759938502, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3876779079437256, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.721413254737854, + "num_tokens": 692965948.0, + "step": 26780 + }, + { + "epoch": 2.941027893696464, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.741304636001587, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7168430089950562, + "num_tokens": 692988724.0, + "step": 26781 + }, + { + "epoch": 2.9411377113990778, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.698739528656006, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 
0.7138441801071167, + "num_tokens": 693010367.0, + "step": 26782 + }, + { + "epoch": 2.941247529101691, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.299886465072632, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7234585881233215, + "num_tokens": 693036425.0, + "step": 26783 + }, + { + "epoch": 2.941357346804305, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.029702663421631, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7235974669456482, + "num_tokens": 693055801.0, + "step": 26784 + }, + { + "epoch": 2.9414671645069186, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8173115253448486, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7154723405838013, + "num_tokens": 693076977.0, + "step": 26785 + }, + { + "epoch": 2.9415769822095323, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.239655017852783, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6960498094558716, + "num_tokens": 693105603.0, + "step": 26786 + }, + { + "epoch": 2.941686799912146, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4013819694519043, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7224392890930176, + "num_tokens": 693132176.0, + "step": 26787 + }, + { + "epoch": 2.9417966176147594, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4788060188293457, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7254672050476074, + "num_tokens": 693158898.0, + "step": 26788 + }, + { + "epoch": 2.941906435317373, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.541616439819336, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7277591228485107, + "num_tokens": 693181428.0, + "step": 26789 + }, + { + "epoch": 2.942016253019987, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3940677642822266, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7146955728530884, + 
"num_tokens": 693209373.0, + "step": 26790 + }, + { + "epoch": 2.9421260707226002, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5479342937469482, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7317097187042236, + "num_tokens": 693233344.0, + "step": 26791 + }, + { + "epoch": 2.942235888425214, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.459383726119995, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7184082865715027, + "num_tokens": 693256521.0, + "step": 26792 + }, + { + "epoch": 2.9423457061278278, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6127216815948486, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7182706594467163, + "num_tokens": 693282044.0, + "step": 26793 + }, + { + "epoch": 2.9424555238304415, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.406073808670044, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7157415747642517, + "num_tokens": 693306197.0, + "step": 26794 + }, + { + "epoch": 2.9425653415330553, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.369266986846924, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6980868577957153, + "num_tokens": 693333976.0, + "step": 26795 + }, + { + "epoch": 2.9426751592356686, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5684187412261963, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7233996391296387, + "num_tokens": 693355923.0, + "step": 26796 + }, + { + "epoch": 2.9427849769382823, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3019680976867676, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.706141471862793, + "num_tokens": 693383526.0, + "step": 26797 + }, + { + "epoch": 2.942894794640896, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.500988006591797, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7069103717803955, + "num_tokens": 693408827.0, + 
"step": 26798 + }, + { + "epoch": 2.94300461234351, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4256463050842285, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7196093797683716, + "num_tokens": 693434950.0, + "step": 26799 + }, + { + "epoch": 2.9431144300461236, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3676531314849854, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7156897187232971, + "num_tokens": 693461097.0, + "step": 26800 + }, + { + "epoch": 2.943224247748737, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4289002418518066, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7036536931991577, + "num_tokens": 693486772.0, + "step": 26801 + }, + { + "epoch": 2.9433340654513507, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.259246349334717, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7076320052146912, + "num_tokens": 693514765.0, + "step": 26802 + }, + { + "epoch": 2.9434438831539644, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.503765821456909, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.7033029198646545, + "num_tokens": 693538919.0, + "step": 26803 + }, + { + "epoch": 2.943553700856578, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5131428241729736, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7370724678039551, + "num_tokens": 693560016.0, + "step": 26804 + }, + { + "epoch": 2.943663518559192, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3088796138763428, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7044896483421326, + "num_tokens": 693590642.0, + "step": 26805 + }, + { + "epoch": 2.9437733362618053, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5975193977355957, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.709667444229126, + "num_tokens": 693614369.0, + "step": 26806 + }, + { + 
"epoch": 2.943883153964419, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.278790235519409, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7118736505508423, + "num_tokens": 693643798.0, + "step": 26807 + }, + { + "epoch": 2.9439929716670328, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1040589809417725, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6939058303833008, + "num_tokens": 693676593.0, + "step": 26808 + }, + { + "epoch": 2.9441027893696465, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.690403938293457, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7247083187103271, + "num_tokens": 693698578.0, + "step": 26809 + }, + { + "epoch": 2.9442126070722603, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1470296382904053, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7163715958595276, + "num_tokens": 693728855.0, + "step": 26810 + }, + { + "epoch": 2.9443224247748736, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.236093282699585, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6858025193214417, + "num_tokens": 693757497.0, + "step": 26811 + }, + { + "epoch": 2.9444322424774874, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5221126079559326, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7224343419075012, + "num_tokens": 693780563.0, + "step": 26812 + }, + { + "epoch": 2.944542060180101, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2899656295776367, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7033132314682007, + "num_tokens": 693807523.0, + "step": 26813 + }, + { + "epoch": 2.9446518778827144, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5429039001464844, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7049480676651001, + "num_tokens": 693831195.0, + "step": 26814 + }, + { + "epoch": 2.9447616955853286, 
+ "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.300631523132324, + "learning_rate": 1e-06, + "loss": 1.0904, + "mean_token_accuracy": 0.6940131187438965, + "num_tokens": 693860382.0, + "step": 26815 + }, + { + "epoch": 2.944871513287942, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.351865291595459, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7062378525733948, + "num_tokens": 693887384.0, + "step": 26816 + }, + { + "epoch": 2.9449813309905557, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.489680290222168, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7249811887741089, + "num_tokens": 693910703.0, + "step": 26817 + }, + { + "epoch": 2.9450911486931695, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.9021472930908203, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6973211765289307, + "num_tokens": 693935462.0, + "step": 26818 + }, + { + "epoch": 2.9452009663957828, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.560147762298584, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7297636270523071, + "num_tokens": 693956186.0, + "step": 26819 + }, + { + "epoch": 2.9453107840983965, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2645413875579834, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7114938497543335, + "num_tokens": 693985511.0, + "step": 26820 + }, + { + "epoch": 2.9454206018010103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5124590396881104, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7375144958496094, + "num_tokens": 694008428.0, + "step": 26821 + }, + { + "epoch": 2.945530419503624, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.326977491378784, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7183902859687805, + "num_tokens": 694035928.0, + "step": 26822 + }, + { + "epoch": 2.945640237206238, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.440798044204712, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6965567469596863, + "num_tokens": 694061531.0, + "step": 26823 + }, + { + "epoch": 2.945750054908851, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5505051612854004, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7121160626411438, + "num_tokens": 694084998.0, + "step": 26824 + }, + { + "epoch": 2.945859872611465, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.659810781478882, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7207070589065552, + "num_tokens": 694106285.0, + "step": 26825 + }, + { + "epoch": 2.9459696903140786, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.743544101715088, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7084629535675049, + "num_tokens": 694135134.0, + "step": 26826 + }, + { + "epoch": 2.9460795080166924, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.472630023956299, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7292004227638245, + "num_tokens": 694160220.0, + "step": 26827 + }, + { + "epoch": 2.946189325719306, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6140453815460205, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7086606621742249, + "num_tokens": 694181966.0, + "step": 26828 + }, + { + "epoch": 2.9462991434219195, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4017446041107178, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7158888578414917, + "num_tokens": 694207810.0, + "step": 26829 + }, + { + "epoch": 2.946408961124533, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.772747278213501, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7085280418395996, + "num_tokens": 694238163.0, + "step": 26830 + }, + { + "epoch": 2.946518778827147, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.59488844871521, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7319899797439575, + "num_tokens": 694260657.0, + "step": 26831 + }, + { + "epoch": 2.9466285965297607, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2468550205230713, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6900525689125061, + "num_tokens": 694290003.0, + "step": 26832 + }, + { + "epoch": 2.9467384142323745, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.415787696838379, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7133743762969971, + "num_tokens": 694315162.0, + "step": 26833 + }, + { + "epoch": 2.946848231934988, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2884299755096436, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7135348916053772, + "num_tokens": 694342414.0, + "step": 26834 + }, + { + "epoch": 2.9469580496376016, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.477277994155884, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7320870161056519, + "num_tokens": 694367724.0, + "step": 26835 + }, + { + "epoch": 2.9470678673402153, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6327016353607178, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7355413436889648, + "num_tokens": 694388463.0, + "step": 26836 + }, + { + "epoch": 2.947177685042829, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3156185150146484, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7265870571136475, + "num_tokens": 694415520.0, + "step": 26837 + }, + { + "epoch": 2.947287502745443, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5888278484344482, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7030273079872131, + "num_tokens": 694438833.0, + "step": 26838 + }, + { + "epoch": 2.947397320448056, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2838261127471924, 
+ "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7297075986862183, + "num_tokens": 694466746.0, + "step": 26839 + }, + { + "epoch": 2.94750713815067, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.677708148956299, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7388978600502014, + "num_tokens": 694488620.0, + "step": 26840 + }, + { + "epoch": 2.9476169558532836, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5940725803375244, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7055412530899048, + "num_tokens": 694513493.0, + "step": 26841 + }, + { + "epoch": 2.947726773555897, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6049373149871826, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7225647568702698, + "num_tokens": 694535953.0, + "step": 26842 + }, + { + "epoch": 2.9478365912585107, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3342859745025635, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7253051996231079, + "num_tokens": 694563545.0, + "step": 26843 + }, + { + "epoch": 2.9479464089611245, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3156206607818604, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7194510102272034, + "num_tokens": 694592406.0, + "step": 26844 + }, + { + "epoch": 2.9480562266637382, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.575828790664673, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.7033035755157471, + "num_tokens": 694615286.0, + "step": 26845 + }, + { + "epoch": 2.948166044366352, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3355350494384766, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6922610998153687, + "num_tokens": 694643591.0, + "step": 26846 + }, + { + "epoch": 2.9482758620689653, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.690419912338257, + "learning_rate": 1e-06, + 
"loss": 0.9049, + "mean_token_accuracy": 0.7236050367355347, + "num_tokens": 694664116.0, + "step": 26847 + }, + { + "epoch": 2.948385679771579, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4920687675476074, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7137623429298401, + "num_tokens": 694688798.0, + "step": 26848 + }, + { + "epoch": 2.948495497474193, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4856128692626953, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7396911382675171, + "num_tokens": 694710847.0, + "step": 26849 + }, + { + "epoch": 2.9486053151768066, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.790208578109741, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7238399982452393, + "num_tokens": 694729400.0, + "step": 26850 + }, + { + "epoch": 2.9487151328794203, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.251713752746582, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.692897379398346, + "num_tokens": 694760140.0, + "step": 26851 + }, + { + "epoch": 2.9488249505820336, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.640381097793579, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7037983536720276, + "num_tokens": 694782721.0, + "step": 26852 + }, + { + "epoch": 2.9489347682846474, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.386937141418457, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7180180549621582, + "num_tokens": 694807509.0, + "step": 26853 + }, + { + "epoch": 2.949044585987261, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4379379749298096, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7161846160888672, + "num_tokens": 694833524.0, + "step": 26854 + }, + { + "epoch": 2.949154403689875, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4848990440368652, + "learning_rate": 1e-06, + "loss": 0.8815, + 
"mean_token_accuracy": 0.7466757297515869, + "num_tokens": 694855875.0, + "step": 26855 + }, + { + "epoch": 2.9492642213924887, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.705348491668701, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7126359343528748, + "num_tokens": 694877608.0, + "step": 26856 + }, + { + "epoch": 2.949374039095102, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4088199138641357, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.718161404132843, + "num_tokens": 694901942.0, + "step": 26857 + }, + { + "epoch": 2.9494838567977157, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.540085792541504, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7243660688400269, + "num_tokens": 694925693.0, + "step": 26858 + }, + { + "epoch": 2.9495936745003295, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2610068321228027, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7139257192611694, + "num_tokens": 694957187.0, + "step": 26859 + }, + { + "epoch": 2.9497034922029433, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5081114768981934, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7105110287666321, + "num_tokens": 694982495.0, + "step": 26860 + }, + { + "epoch": 2.949813309905557, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.648847818374634, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7236552834510803, + "num_tokens": 695006353.0, + "step": 26861 + }, + { + "epoch": 2.9499231276081703, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4357686042785645, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7237815856933594, + "num_tokens": 695030589.0, + "step": 26862 + }, + { + "epoch": 2.950032945310784, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6601815223693848, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 
0.7049071788787842, + "num_tokens": 695055152.0, + "step": 26863 + }, + { + "epoch": 2.950142763013398, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7006115913391113, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.722036600112915, + "num_tokens": 695077887.0, + "step": 26864 + }, + { + "epoch": 2.950252580716011, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2476320266723633, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7007601261138916, + "num_tokens": 695108433.0, + "step": 26865 + }, + { + "epoch": 2.9503623984186254, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4657864570617676, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7085298895835876, + "num_tokens": 695134430.0, + "step": 26866 + }, + { + "epoch": 2.9504722161212387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6087124347686768, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7131365537643433, + "num_tokens": 695157626.0, + "step": 26867 + }, + { + "epoch": 2.9505820338238524, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3056299686431885, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6991654634475708, + "num_tokens": 695185852.0, + "step": 26868 + }, + { + "epoch": 2.950691851526466, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.331174850463867, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7082504034042358, + "num_tokens": 695213099.0, + "step": 26869 + }, + { + "epoch": 2.9508016692290795, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.49039888381958, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.698091447353363, + "num_tokens": 695239514.0, + "step": 26870 + }, + { + "epoch": 2.9509114869316933, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.328322172164917, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7035403847694397, + 
"num_tokens": 695267943.0, + "step": 26871 + }, + { + "epoch": 2.951021304634307, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9460740089416504, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7491569519042969, + "num_tokens": 695287393.0, + "step": 26872 + }, + { + "epoch": 2.9511311223369208, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.53108549118042, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7162666320800781, + "num_tokens": 695310829.0, + "step": 26873 + }, + { + "epoch": 2.9512409400395345, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.470623254776001, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7278445363044739, + "num_tokens": 695335851.0, + "step": 26874 + }, + { + "epoch": 2.951350757742148, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4641811847686768, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7175756692886353, + "num_tokens": 695359724.0, + "step": 26875 + }, + { + "epoch": 2.9514605754447616, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.47847318649292, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7057157158851624, + "num_tokens": 695383927.0, + "step": 26876 + }, + { + "epoch": 2.9515703931473753, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1818418502807617, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7156790494918823, + "num_tokens": 695415377.0, + "step": 26877 + }, + { + "epoch": 2.951680210849989, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.491269588470459, + "learning_rate": 1e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6926720142364502, + "num_tokens": 695444063.0, + "step": 26878 + }, + { + "epoch": 2.951790028552603, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.623314619064331, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.721045970916748, + "num_tokens": 695468197.0, + "step": 
26879 + }, + { + "epoch": 2.951899846255216, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.158766984939575, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.6885011196136475, + "num_tokens": 695498017.0, + "step": 26880 + }, + { + "epoch": 2.95200966395783, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.449151039123535, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6875261068344116, + "num_tokens": 695523901.0, + "step": 26881 + }, + { + "epoch": 2.9521194816604437, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.801649808883667, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7167086601257324, + "num_tokens": 695552061.0, + "step": 26882 + }, + { + "epoch": 2.9522292993630574, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.300459623336792, + "learning_rate": 1e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.6858007311820984, + "num_tokens": 695582133.0, + "step": 26883 + }, + { + "epoch": 2.952339117065671, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.359647512435913, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7139389514923096, + "num_tokens": 695609439.0, + "step": 26884 + }, + { + "epoch": 2.9524489347682845, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5815539360046387, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.7115444540977478, + "num_tokens": 695634479.0, + "step": 26885 + }, + { + "epoch": 2.9525587524708983, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.429436445236206, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7030161619186401, + "num_tokens": 695659018.0, + "step": 26886 + }, + { + "epoch": 2.952668570173512, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2492096424102783, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7114222049713135, + "num_tokens": 695685941.0, + "step": 26887 + }, + { + "epoch": 
2.952778387876126, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7512145042419434, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7092767953872681, + "num_tokens": 695706360.0, + "step": 26888 + }, + { + "epoch": 2.9528882055787395, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2777397632598877, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7241093516349792, + "num_tokens": 695736570.0, + "step": 26889 + }, + { + "epoch": 2.952998023281353, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.674166679382324, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7202102541923523, + "num_tokens": 695758780.0, + "step": 26890 + }, + { + "epoch": 2.9531078409839666, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.592430830001831, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7106279134750366, + "num_tokens": 695781814.0, + "step": 26891 + }, + { + "epoch": 2.9532176586865804, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1539840698242188, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7197737693786621, + "num_tokens": 695816254.0, + "step": 26892 + }, + { + "epoch": 2.9533274763891937, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.490560531616211, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.710148811340332, + "num_tokens": 695841352.0, + "step": 26893 + }, + { + "epoch": 2.9534372940918074, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.364036798477173, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7047486901283264, + "num_tokens": 695869778.0, + "step": 26894 + }, + { + "epoch": 2.953547111794421, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.460400104522705, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7077858448028564, + "num_tokens": 695895169.0, + "step": 26895 + }, + { + "epoch": 2.953656929497035, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.0960822105407715, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.726033627986908, + "num_tokens": 695929771.0, + "step": 26896 + }, + { + "epoch": 2.9537667471996487, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5713274478912354, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.719801664352417, + "num_tokens": 695952223.0, + "step": 26897 + }, + { + "epoch": 2.953876564902262, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.219716787338257, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.7003487348556519, + "num_tokens": 695983239.0, + "step": 26898 + }, + { + "epoch": 2.953986382604876, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.478743553161621, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7127363681793213, + "num_tokens": 696007787.0, + "step": 26899 + }, + { + "epoch": 2.9540962003074895, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5599451065063477, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7141867280006409, + "num_tokens": 696032528.0, + "step": 26900 + }, + { + "epoch": 2.9542060180101033, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3499414920806885, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6941006183624268, + "num_tokens": 696061706.0, + "step": 26901 + }, + { + "epoch": 2.954315835712717, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.326472520828247, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7146486043930054, + "num_tokens": 696090386.0, + "step": 26902 + }, + { + "epoch": 2.9544256534153304, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4159610271453857, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7224370241165161, + "num_tokens": 696114610.0, + "step": 26903 + }, + { + "epoch": 2.954535471117944, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.035400867462158, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7217049598693848, + "num_tokens": 696147123.0, + "step": 26904 + }, + { + "epoch": 2.954645288820558, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.216531991958618, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.718364953994751, + "num_tokens": 696176382.0, + "step": 26905 + }, + { + "epoch": 2.9547551065231716, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.289231538772583, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7112904191017151, + "num_tokens": 696207007.0, + "step": 26906 + }, + { + "epoch": 2.9548649242257854, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.448509454727173, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.715736448764801, + "num_tokens": 696233648.0, + "step": 26907 + }, + { + "epoch": 2.9549747419283987, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3417255878448486, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.715284526348114, + "num_tokens": 696261850.0, + "step": 26908 + }, + { + "epoch": 2.9550845596310125, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7025086879730225, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7302988767623901, + "num_tokens": 696283083.0, + "step": 26909 + }, + { + "epoch": 2.955194377333626, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2181880474090576, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6838547587394714, + "num_tokens": 696318250.0, + "step": 26910 + }, + { + "epoch": 2.95530419503624, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.721479892730713, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7232282161712646, + "num_tokens": 696338696.0, + "step": 26911 + }, + { + "epoch": 2.9554140127388537, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.783006429672241, + 
"learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7232524156570435, + "num_tokens": 696359347.0, + "step": 26912 + }, + { + "epoch": 2.955523830441467, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.782721996307373, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7305325865745544, + "num_tokens": 696379979.0, + "step": 26913 + }, + { + "epoch": 2.955633648144081, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.475867986679077, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6987631320953369, + "num_tokens": 696410744.0, + "step": 26914 + }, + { + "epoch": 2.9557434658466946, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.297776699066162, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7071233987808228, + "num_tokens": 696442078.0, + "step": 26915 + }, + { + "epoch": 2.955853283549308, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.565208673477173, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7189432978630066, + "num_tokens": 696464422.0, + "step": 26916 + }, + { + "epoch": 2.955963101251922, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.614880084991455, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7218322157859802, + "num_tokens": 696488774.0, + "step": 26917 + }, + { + "epoch": 2.9560729189545354, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3437798023223877, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.7000482082366943, + "num_tokens": 696516657.0, + "step": 26918 + }, + { + "epoch": 2.956182736657149, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.833496570587158, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7129433155059814, + "num_tokens": 696537618.0, + "step": 26919 + }, + { + "epoch": 2.956292554359763, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.951951503753662, + "learning_rate": 1e-06, + "loss": 
0.9074, + "mean_token_accuracy": 0.7325147390365601, + "num_tokens": 696562197.0, + "step": 26920 + }, + { + "epoch": 2.956402372062376, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.521073341369629, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7080516815185547, + "num_tokens": 696588380.0, + "step": 26921 + }, + { + "epoch": 2.95651218976499, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4315202236175537, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7111165523529053, + "num_tokens": 696612837.0, + "step": 26922 + }, + { + "epoch": 2.9566220074676037, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.586063861846924, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7164007425308228, + "num_tokens": 696633842.0, + "step": 26923 + }, + { + "epoch": 2.9567318251702175, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.385986328125, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7109967470169067, + "num_tokens": 696660609.0, + "step": 26924 + }, + { + "epoch": 2.9568416428728312, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.397725820541382, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.7030335068702698, + "num_tokens": 696688257.0, + "step": 26925 + }, + { + "epoch": 2.9569514605754446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.499253749847412, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7266305088996887, + "num_tokens": 696712726.0, + "step": 26926 + }, + { + "epoch": 2.9570612782780583, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1975014209747314, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7183200120925903, + "num_tokens": 696743551.0, + "step": 26927 + }, + { + "epoch": 2.957171095980672, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3935046195983887, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 
0.7291224598884583, + "num_tokens": 696772162.0, + "step": 26928 + }, + { + "epoch": 2.957280913683286, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3892571926116943, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7270204424858093, + "num_tokens": 696796890.0, + "step": 26929 + }, + { + "epoch": 2.9573907313858996, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5635781288146973, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7080467343330383, + "num_tokens": 696821425.0, + "step": 26930 + }, + { + "epoch": 2.957500549088513, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3847830295562744, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7094931602478027, + "num_tokens": 696849118.0, + "step": 26931 + }, + { + "epoch": 2.9576103667911267, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4114551544189453, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7037450075149536, + "num_tokens": 696875342.0, + "step": 26932 + }, + { + "epoch": 2.9577201844937404, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.488398313522339, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7109900712966919, + "num_tokens": 696900139.0, + "step": 26933 + }, + { + "epoch": 2.957830002196354, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4642622470855713, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7018852233886719, + "num_tokens": 696925544.0, + "step": 26934 + }, + { + "epoch": 2.957939819898968, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3962008953094482, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7110324501991272, + "num_tokens": 696957200.0, + "step": 26935 + }, + { + "epoch": 2.9580496376015812, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.506442070007324, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.7028610110282898, + 
"num_tokens": 696984092.0, + "step": 26936 + }, + { + "epoch": 2.958159455304195, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.68380069732666, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7266180515289307, + "num_tokens": 697009435.0, + "step": 26937 + }, + { + "epoch": 2.9582692730068088, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.155940055847168, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7224675416946411, + "num_tokens": 697040127.0, + "step": 26938 + }, + { + "epoch": 2.9583790907094225, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.175809621810913, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7140689492225647, + "num_tokens": 697068990.0, + "step": 26939 + }, + { + "epoch": 2.9584889084120363, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.473031759262085, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7096706628799438, + "num_tokens": 697093253.0, + "step": 26940 + }, + { + "epoch": 2.9585987261146496, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1205639839172363, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.7001937031745911, + "num_tokens": 697127762.0, + "step": 26941 + }, + { + "epoch": 2.9587085438172633, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.463284492492676, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7000442147254944, + "num_tokens": 697152355.0, + "step": 26942 + }, + { + "epoch": 2.958818361519877, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4113056659698486, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7337608337402344, + "num_tokens": 697177570.0, + "step": 26943 + }, + { + "epoch": 2.9589281792224904, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3300702571868896, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7456324100494385, + "num_tokens": 697203614.0, + 
"step": 26944 + }, + { + "epoch": 2.959037996925104, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5249524116516113, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7302623987197876, + "num_tokens": 697227800.0, + "step": 26945 + }, + { + "epoch": 2.959147814627718, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.651973009109497, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7142074108123779, + "num_tokens": 697249865.0, + "step": 26946 + }, + { + "epoch": 2.9592576323303317, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.592750310897827, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.713538646697998, + "num_tokens": 697272909.0, + "step": 26947 + }, + { + "epoch": 2.9593674500329454, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2496588230133057, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7323925495147705, + "num_tokens": 697300674.0, + "step": 26948 + }, + { + "epoch": 2.9594772677355587, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3866331577301025, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7464439868927002, + "num_tokens": 697327265.0, + "step": 26949 + }, + { + "epoch": 2.9595870854381725, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.332279920578003, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7406005859375, + "num_tokens": 697352568.0, + "step": 26950 + }, + { + "epoch": 2.9596969031407863, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.18051815032959, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7092335224151611, + "num_tokens": 697383626.0, + "step": 26951 + }, + { + "epoch": 2.9598067208434, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6510050296783447, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7153396010398865, + "num_tokens": 697406230.0, + "step": 26952 + }, + { + "epoch": 
2.9599165385460138, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.653074026107788, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7243239283561707, + "num_tokens": 697428583.0, + "step": 26953 + }, + { + "epoch": 2.960026356248627, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9529964923858643, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7134718894958496, + "num_tokens": 697450490.0, + "step": 26954 + }, + { + "epoch": 2.960136173951241, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6009976863861084, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7010990381240845, + "num_tokens": 697474413.0, + "step": 26955 + }, + { + "epoch": 2.9602459916538546, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.441342830657959, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6910547614097595, + "num_tokens": 697500328.0, + "step": 26956 + }, + { + "epoch": 2.9603558093564684, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2700181007385254, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7299025654792786, + "num_tokens": 697531539.0, + "step": 26957 + }, + { + "epoch": 2.960465627059082, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4030940532684326, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6956531405448914, + "num_tokens": 697557611.0, + "step": 26958 + }, + { + "epoch": 2.9605754447616954, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.384782314300537, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7073889970779419, + "num_tokens": 697584171.0, + "step": 26959 + }, + { + "epoch": 2.960685262464309, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4314539432525635, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7146438956260681, + "num_tokens": 697610562.0, + "step": 26960 + }, + { + "epoch": 2.960795080166923, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.6631617546081543, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7215997576713562, + "num_tokens": 697632479.0, + "step": 26961 + }, + { + "epoch": 2.9609048978695367, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3560383319854736, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7093061208724976, + "num_tokens": 697659337.0, + "step": 26962 + }, + { + "epoch": 2.9610147155721505, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.152594566345215, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7177265882492065, + "num_tokens": 697692917.0, + "step": 26963 + }, + { + "epoch": 2.9611245332747638, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.197629690170288, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.7049872875213623, + "num_tokens": 697722169.0, + "step": 26964 + }, + { + "epoch": 2.9612343509773775, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.40649151802063, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7122130393981934, + "num_tokens": 697750091.0, + "step": 26965 + }, + { + "epoch": 2.9613441686799913, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.464303493499756, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7024235725402832, + "num_tokens": 697776446.0, + "step": 26966 + }, + { + "epoch": 2.961453986382605, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8093080520629883, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7065589427947998, + "num_tokens": 697796895.0, + "step": 26967 + }, + { + "epoch": 2.961563804085219, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.370344400405884, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6936702728271484, + "num_tokens": 697827015.0, + "step": 26968 + }, + { + "epoch": 2.961673621787832, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.7528793811798096, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.742219865322113, + "num_tokens": 697849556.0, + "step": 26969 + }, + { + "epoch": 2.961783439490446, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5366735458374023, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7317153811454773, + "num_tokens": 697873031.0, + "step": 26970 + }, + { + "epoch": 2.9618932571930596, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.465162992477417, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7086065411567688, + "num_tokens": 697902202.0, + "step": 26971 + }, + { + "epoch": 2.962003074895673, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3760318756103516, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.723243772983551, + "num_tokens": 697929995.0, + "step": 26972 + }, + { + "epoch": 2.9621128925982867, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.587674379348755, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7239153981208801, + "num_tokens": 697954809.0, + "step": 26973 + }, + { + "epoch": 2.9622227103009005, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5629148483276367, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6868213415145874, + "num_tokens": 697982219.0, + "step": 26974 + }, + { + "epoch": 2.962332528003514, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3655712604522705, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7073541283607483, + "num_tokens": 698010780.0, + "step": 26975 + }, + { + "epoch": 2.962442345706128, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.012645244598389, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.722547709941864, + "num_tokens": 698036413.0, + "step": 26976 + }, + { + "epoch": 2.9625521634087413, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3576226234436035, + 
"learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7256420850753784, + "num_tokens": 698062900.0, + "step": 26977 + }, + { + "epoch": 2.962661981111355, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7068374156951904, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7038009166717529, + "num_tokens": 698087121.0, + "step": 26978 + }, + { + "epoch": 2.962771798813969, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 32.61385726928711, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6937522888183594, + "num_tokens": 698113106.0, + "step": 26979 + }, + { + "epoch": 2.9628816165165826, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7171878814697266, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7273581027984619, + "num_tokens": 698136953.0, + "step": 26980 + }, + { + "epoch": 2.9629914342191963, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6531600952148438, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6915853023529053, + "num_tokens": 698160688.0, + "step": 26981 + }, + { + "epoch": 2.9631012519218096, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4945619106292725, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7201567888259888, + "num_tokens": 698186106.0, + "step": 26982 + }, + { + "epoch": 2.9632110696244234, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2925589084625244, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.717770516872406, + "num_tokens": 698213086.0, + "step": 26983 + }, + { + "epoch": 2.963320887327037, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1200735569000244, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7313715815544128, + "num_tokens": 698243181.0, + "step": 26984 + }, + { + "epoch": 2.963430705029651, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5278873443603516, + "learning_rate": 1e-06, + 
"loss": 0.9384, + "mean_token_accuracy": 0.7234119772911072, + "num_tokens": 698267595.0, + "step": 26985 + }, + { + "epoch": 2.9635405227322646, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5355844497680664, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7293594479560852, + "num_tokens": 698291108.0, + "step": 26986 + }, + { + "epoch": 2.963650340434878, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2555301189422607, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7182959318161011, + "num_tokens": 698321436.0, + "step": 26987 + }, + { + "epoch": 2.9637601581374917, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.457990884780884, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.713354766368866, + "num_tokens": 698348173.0, + "step": 26988 + }, + { + "epoch": 2.9638699758401055, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.159010887145996, + "learning_rate": 1e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6908048391342163, + "num_tokens": 698382618.0, + "step": 26989 + }, + { + "epoch": 2.9639797935427192, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.0191245079040527, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.7055909037590027, + "num_tokens": 698419430.0, + "step": 26990 + }, + { + "epoch": 2.964089611245333, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.626737356185913, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7305443286895752, + "num_tokens": 698440289.0, + "step": 26991 + }, + { + "epoch": 2.9641994289479463, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.360589027404785, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.713305652141571, + "num_tokens": 698468329.0, + "step": 26992 + }, + { + "epoch": 2.96430924665056, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3045332431793213, + "learning_rate": 1e-06, + "loss": 0.9409, + 
"mean_token_accuracy": 0.7188080549240112, + "num_tokens": 698493506.0, + "step": 26993 + }, + { + "epoch": 2.964419064353174, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8481945991516113, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7307195663452148, + "num_tokens": 698512949.0, + "step": 26994 + }, + { + "epoch": 2.964528882055787, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3899426460266113, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.7044833898544312, + "num_tokens": 698540485.0, + "step": 26995 + }, + { + "epoch": 2.9646386997584013, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.407801628112793, + "learning_rate": 1e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6914224028587341, + "num_tokens": 698568590.0, + "step": 26996 + }, + { + "epoch": 2.9647485174610146, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.371752977371216, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7145135402679443, + "num_tokens": 698595857.0, + "step": 26997 + }, + { + "epoch": 2.9648583351636284, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.130678653717041, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6931791305541992, + "num_tokens": 698630604.0, + "step": 26998 + }, + { + "epoch": 2.964968152866242, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4915292263031006, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7317381501197815, + "num_tokens": 698655461.0, + "step": 26999 + }, + { + "epoch": 2.9650779705688555, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.161769151687622, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6931299567222595, + "num_tokens": 698690583.0, + "step": 27000 + }, + { + "epoch": 2.9651877882714692, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3695833683013916, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 
0.69503253698349, + "num_tokens": 698717539.0, + "step": 27001 + }, + { + "epoch": 2.965297605974083, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7279112339019775, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7336258888244629, + "num_tokens": 698740933.0, + "step": 27002 + }, + { + "epoch": 2.9654074236766967, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3968992233276367, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7179898619651794, + "num_tokens": 698767324.0, + "step": 27003 + }, + { + "epoch": 2.9655172413793105, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.376161575317383, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7070434093475342, + "num_tokens": 698795914.0, + "step": 27004 + }, + { + "epoch": 2.965627059081924, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5046772956848145, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7100033760070801, + "num_tokens": 698821613.0, + "step": 27005 + }, + { + "epoch": 2.9657368767845376, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.780900001525879, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7083640098571777, + "num_tokens": 698842556.0, + "step": 27006 + }, + { + "epoch": 2.9658466944871513, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4062821865081787, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7124795913696289, + "num_tokens": 698871271.0, + "step": 27007 + }, + { + "epoch": 2.965956512189765, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4441134929656982, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6960759162902832, + "num_tokens": 698897929.0, + "step": 27008 + }, + { + "epoch": 2.966066329892379, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.512474775314331, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6976386904716492, + 
"num_tokens": 698922956.0, + "step": 27009 + }, + { + "epoch": 2.966176147594992, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.152621030807495, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7281885147094727, + "num_tokens": 698952848.0, + "step": 27010 + }, + { + "epoch": 2.966285965297606, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5325870513916016, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7198197841644287, + "num_tokens": 698977153.0, + "step": 27011 + }, + { + "epoch": 2.9663957830002197, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.700315475463867, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7495633363723755, + "num_tokens": 698996246.0, + "step": 27012 + }, + { + "epoch": 2.9665056007028334, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6162238121032715, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7165733575820923, + "num_tokens": 699018202.0, + "step": 27013 + }, + { + "epoch": 2.966615418405447, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6687026023864746, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7124577760696411, + "num_tokens": 699039269.0, + "step": 27014 + }, + { + "epoch": 2.9667252361080605, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3036980628967285, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.70576411485672, + "num_tokens": 699067947.0, + "step": 27015 + }, + { + "epoch": 2.9668350538106742, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4554057121276855, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7102648615837097, + "num_tokens": 699093164.0, + "step": 27016 + }, + { + "epoch": 2.966944871513288, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5696611404418945, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7354021072387695, + "num_tokens": 699116466.0, + 
"step": 27017 + }, + { + "epoch": 2.9670546892159018, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6306228637695312, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7395143508911133, + "num_tokens": 699138646.0, + "step": 27018 + }, + { + "epoch": 2.9671645069185155, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.511805295944214, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7179733514785767, + "num_tokens": 699163571.0, + "step": 27019 + }, + { + "epoch": 2.967274324621129, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2509238719940186, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.7029149532318115, + "num_tokens": 699194222.0, + "step": 27020 + }, + { + "epoch": 2.9673841423237426, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.19844388961792, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.709092378616333, + "num_tokens": 699225044.0, + "step": 27021 + }, + { + "epoch": 2.9674939600263563, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2837469577789307, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6965314149856567, + "num_tokens": 699254793.0, + "step": 27022 + }, + { + "epoch": 2.9676037777289697, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6477549076080322, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7146813869476318, + "num_tokens": 699276903.0, + "step": 27023 + }, + { + "epoch": 2.9677135954315834, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3397274017333984, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7292333245277405, + "num_tokens": 699306065.0, + "step": 27024 + }, + { + "epoch": 2.967823413134197, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.778986930847168, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.7079604864120483, + "num_tokens": 699329481.0, + "step": 27025 + }, + { + 
"epoch": 2.967933230836811, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1520352363586426, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6975182294845581, + "num_tokens": 699361293.0, + "step": 27026 + }, + { + "epoch": 2.9680430485394247, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7810134887695312, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7254791259765625, + "num_tokens": 699387539.0, + "step": 27027 + }, + { + "epoch": 2.968152866242038, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.760350227355957, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7215548157691956, + "num_tokens": 699407609.0, + "step": 27028 + }, + { + "epoch": 2.9682626839446518, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3426363468170166, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7069189548492432, + "num_tokens": 699435534.0, + "step": 27029 + }, + { + "epoch": 2.9683725016472655, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4413390159606934, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7131731510162354, + "num_tokens": 699460880.0, + "step": 27030 + }, + { + "epoch": 2.9684823193498793, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.640484094619751, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7359343767166138, + "num_tokens": 699481884.0, + "step": 27031 + }, + { + "epoch": 2.968592137052493, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6405014991760254, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7168985605239868, + "num_tokens": 699506015.0, + "step": 27032 + }, + { + "epoch": 2.9687019547551063, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4350903034210205, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7024997472763062, + "num_tokens": 699532009.0, + "step": 27033 + }, + { + "epoch": 2.96881177245772, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.561563730239868, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7195731401443481, + "num_tokens": 699553030.0, + "step": 27034 + }, + { + "epoch": 2.968921590160334, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4229037761688232, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7222298383712769, + "num_tokens": 699579827.0, + "step": 27035 + }, + { + "epoch": 2.9690314078629476, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3220770359039307, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7129238843917847, + "num_tokens": 699606097.0, + "step": 27036 + }, + { + "epoch": 2.9691412255655614, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4467155933380127, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7269769906997681, + "num_tokens": 699628139.0, + "step": 27037 + }, + { + "epoch": 2.9692510432681747, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.352133274078369, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7103389501571655, + "num_tokens": 699656894.0, + "step": 27038 + }, + { + "epoch": 2.9693608609707884, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5632541179656982, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7079142332077026, + "num_tokens": 699682476.0, + "step": 27039 + }, + { + "epoch": 2.969470678673402, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.490300178527832, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7061495780944824, + "num_tokens": 699708794.0, + "step": 27040 + }, + { + "epoch": 2.969580496376016, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6095497608184814, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7243321537971497, + "num_tokens": 699731205.0, + "step": 27041 + }, + { + "epoch": 2.9696903140786297, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.4200830459594727, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7221616506576538, + "num_tokens": 699757450.0, + "step": 27042 + }, + { + "epoch": 2.969800131781243, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.588545560836792, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7305878400802612, + "num_tokens": 699778921.0, + "step": 27043 + }, + { + "epoch": 2.969909949483857, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2859604358673096, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7151837348937988, + "num_tokens": 699808168.0, + "step": 27044 + }, + { + "epoch": 2.9700197671864705, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2337331771850586, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.711399495601654, + "num_tokens": 699838825.0, + "step": 27045 + }, + { + "epoch": 2.970129584889084, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.331348419189453, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.7071031332015991, + "num_tokens": 699866348.0, + "step": 27046 + }, + { + "epoch": 2.970239402591698, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.546757698059082, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7284224629402161, + "num_tokens": 699889755.0, + "step": 27047 + }, + { + "epoch": 2.9703492202943114, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 7.024721145629883, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.692855715751648, + "num_tokens": 699922208.0, + "step": 27048 + }, + { + "epoch": 2.970459037996925, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.898861885070801, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7104192972183228, + "num_tokens": 699941048.0, + "step": 27049 + }, + { + "epoch": 2.970568855699539, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.3310420513153076, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6998102068901062, + "num_tokens": 699969241.0, + "step": 27050 + }, + { + "epoch": 2.970678673402152, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4504234790802, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7174593210220337, + "num_tokens": 699993630.0, + "step": 27051 + }, + { + "epoch": 2.970788491104766, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3766064643859863, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7205777764320374, + "num_tokens": 700020713.0, + "step": 27052 + }, + { + "epoch": 2.9708983088073797, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 2.596158027648926, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7236945033073425, + "num_tokens": 700043333.0, + "step": 27053 + }, + { + "epoch": 2.9710081265099935, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 2.324267625808716, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.7035006284713745, + "num_tokens": 700071824.0, + "step": 27054 + }, + { + "epoch": 2.971117944212607, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.343547821044922, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7326626777648926, + "num_tokens": 700096285.0, + "step": 27055 + }, + { + "epoch": 2.9712277619152205, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.427004098892212, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7079945802688599, + "num_tokens": 700122349.0, + "step": 27056 + }, + { + "epoch": 2.9713375796178343, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.204707145690918, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7156659960746765, + "num_tokens": 700152605.0, + "step": 27057 + }, + { + "epoch": 2.971447397320448, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5942485332489014, + 
"learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7377429008483887, + "num_tokens": 700173611.0, + "step": 27058 + }, + { + "epoch": 2.971557215023062, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.246126890182495, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7068219184875488, + "num_tokens": 700205386.0, + "step": 27059 + }, + { + "epoch": 2.9716670327256756, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2318053245544434, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7168956995010376, + "num_tokens": 700235046.0, + "step": 27060 + }, + { + "epoch": 2.971776850428289, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.324396848678589, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7070543169975281, + "num_tokens": 700265594.0, + "step": 27061 + }, + { + "epoch": 2.9718866681309026, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.036562442779541, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7302842140197754, + "num_tokens": 700284713.0, + "step": 27062 + }, + { + "epoch": 2.9719964858335164, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.488640308380127, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7134544849395752, + "num_tokens": 700311304.0, + "step": 27063 + }, + { + "epoch": 2.97210630353613, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5920748710632324, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.718450665473938, + "num_tokens": 700334528.0, + "step": 27064 + }, + { + "epoch": 2.972216121238744, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.317657709121704, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7036924958229065, + "num_tokens": 700365145.0, + "step": 27065 + }, + { + "epoch": 2.972325938941357, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2197561264038086, + "learning_rate": 1e-06, + "loss": 
0.9657, + "mean_token_accuracy": 0.721671998500824, + "num_tokens": 700394174.0, + "step": 27066 + }, + { + "epoch": 2.972435756643971, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2669076919555664, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7203444838523865, + "num_tokens": 700422187.0, + "step": 27067 + }, + { + "epoch": 2.9725455743465847, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4184560775756836, + "learning_rate": 1e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.6840760707855225, + "num_tokens": 700449559.0, + "step": 27068 + }, + { + "epoch": 2.9726553920491985, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.423523187637329, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.7028225660324097, + "num_tokens": 700479833.0, + "step": 27069 + }, + { + "epoch": 2.9727652097518122, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5979971885681152, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7280364632606506, + "num_tokens": 700501218.0, + "step": 27070 + }, + { + "epoch": 2.9728750274544256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.340146541595459, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6879750490188599, + "num_tokens": 700529618.0, + "step": 27071 + }, + { + "epoch": 2.9729848451570393, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3453385829925537, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7169001698493958, + "num_tokens": 700555811.0, + "step": 27072 + }, + { + "epoch": 2.973094662859653, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.9005205631256104, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7507238984107971, + "num_tokens": 700574821.0, + "step": 27073 + }, + { + "epoch": 2.9732044805622664, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.526268482208252, + "learning_rate": 1e-06, + "loss": 0.9993, + 
"mean_token_accuracy": 0.704461932182312, + "num_tokens": 700599791.0, + "step": 27074 + }, + { + "epoch": 2.97331429826488, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.43707275390625, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7182483077049255, + "num_tokens": 700627048.0, + "step": 27075 + }, + { + "epoch": 2.973424115967494, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3908960819244385, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.7007614970207214, + "num_tokens": 700656950.0, + "step": 27076 + }, + { + "epoch": 2.9735339336701077, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.340634822845459, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6996715068817139, + "num_tokens": 700686909.0, + "step": 27077 + }, + { + "epoch": 2.9736437513727214, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.531714677810669, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7320131659507751, + "num_tokens": 700711077.0, + "step": 27078 + }, + { + "epoch": 2.9737535690753347, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4273221492767334, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7149591445922852, + "num_tokens": 700737323.0, + "step": 27079 + }, + { + "epoch": 2.9738633867779485, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4311184883117676, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7093061208724976, + "num_tokens": 700763459.0, + "step": 27080 + }, + { + "epoch": 2.9739732044805622, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.319681406021118, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.743511974811554, + "num_tokens": 700790020.0, + "step": 27081 + }, + { + "epoch": 2.974083022183176, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.682279348373413, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 
0.7493252158164978, + "num_tokens": 700810259.0, + "step": 27082 + }, + { + "epoch": 2.9741928398857898, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.339840888977051, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7024896144866943, + "num_tokens": 700837021.0, + "step": 27083 + }, + { + "epoch": 2.974302657588403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.279125690460205, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7192496657371521, + "num_tokens": 700864837.0, + "step": 27084 + }, + { + "epoch": 2.974412475291017, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.334479331970215, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7150858640670776, + "num_tokens": 700893517.0, + "step": 27085 + }, + { + "epoch": 2.9745222929936306, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5203447341918945, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.7011924982070923, + "num_tokens": 700918292.0, + "step": 27086 + }, + { + "epoch": 2.9746321106962443, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4591567516326904, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.71970534324646, + "num_tokens": 700942703.0, + "step": 27087 + }, + { + "epoch": 2.974741928398858, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6195878982543945, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.72733473777771, + "num_tokens": 700966432.0, + "step": 27088 + }, + { + "epoch": 2.9748517461014714, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3009705543518066, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.715246319770813, + "num_tokens": 700994539.0, + "step": 27089 + }, + { + "epoch": 2.974961563804085, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.085782051086426, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.7044194936752319, + "num_tokens": 
701028518.0, + "step": 27090 + }, + { + "epoch": 2.975071381506699, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4263155460357666, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7222627401351929, + "num_tokens": 701053782.0, + "step": 27091 + }, + { + "epoch": 2.9751811992093127, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.956774950027466, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7340242862701416, + "num_tokens": 701077793.0, + "step": 27092 + }, + { + "epoch": 2.9752910169119264, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.517939567565918, + "learning_rate": 1e-06, + "loss": 1.1095, + "mean_token_accuracy": 0.6937994956970215, + "num_tokens": 701104306.0, + "step": 27093 + }, + { + "epoch": 2.9754008346145397, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4908182621002197, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7219119071960449, + "num_tokens": 701129809.0, + "step": 27094 + }, + { + "epoch": 2.9755106523171535, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.331064462661743, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7230362892150879, + "num_tokens": 701157533.0, + "step": 27095 + }, + { + "epoch": 2.9756204700197673, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.250434160232544, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7141193747520447, + "num_tokens": 701188408.0, + "step": 27096 + }, + { + "epoch": 2.9757302877223806, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.50705623626709, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.7038434743881226, + "num_tokens": 701212371.0, + "step": 27097 + }, + { + "epoch": 2.9758401054249948, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.462337017059326, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.732097327709198, + "num_tokens": 701235803.0, + "step": 27098 + }, 
+ { + "epoch": 2.975949923127608, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.369866371154785, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7034299373626709, + "num_tokens": 701263089.0, + "step": 27099 + }, + { + "epoch": 2.976059740830222, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.456570863723755, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7048594951629639, + "num_tokens": 701288344.0, + "step": 27100 + }, + { + "epoch": 2.9761695585328356, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.462690591812134, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.707543134689331, + "num_tokens": 701314693.0, + "step": 27101 + }, + { + "epoch": 2.976279376235449, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.835843324661255, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7372535467147827, + "num_tokens": 701332861.0, + "step": 27102 + }, + { + "epoch": 2.9763891939380627, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.482828140258789, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6924877762794495, + "num_tokens": 701362650.0, + "step": 27103 + }, + { + "epoch": 2.9764990116406764, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5451126098632812, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7030012607574463, + "num_tokens": 701387886.0, + "step": 27104 + }, + { + "epoch": 2.97660882934329, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4549548625946045, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7144752144813538, + "num_tokens": 701411826.0, + "step": 27105 + }, + { + "epoch": 2.976718647045904, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3108460903167725, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7231808304786682, + "num_tokens": 701437492.0, + "step": 27106 + }, + { + "epoch": 2.9768284647485173, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.246805429458618, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7127633094787598, + "num_tokens": 701467832.0, + "step": 27107 + }, + { + "epoch": 2.976938282451131, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4410722255706787, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7101898193359375, + "num_tokens": 701495198.0, + "step": 27108 + }, + { + "epoch": 2.9770481001537448, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3336923122406006, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7058423757553101, + "num_tokens": 701523997.0, + "step": 27109 + }, + { + "epoch": 2.9771579178563585, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.515273094177246, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6947024464607239, + "num_tokens": 701549938.0, + "step": 27110 + }, + { + "epoch": 2.9772677355589723, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.400273561477661, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7085325717926025, + "num_tokens": 701576929.0, + "step": 27111 + }, + { + "epoch": 2.9773775532615856, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.051255941390991, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7122284173965454, + "num_tokens": 701594938.0, + "step": 27112 + }, + { + "epoch": 2.9774873709641994, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.45174241065979, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6988665461540222, + "num_tokens": 701620664.0, + "step": 27113 + }, + { + "epoch": 2.977597188666813, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6444079875946045, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7325799465179443, + "num_tokens": 701642530.0, + "step": 27114 + }, + { + "epoch": 2.977707006369427, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.4017233848571777, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7095259428024292, + "num_tokens": 701667475.0, + "step": 27115 + }, + { + "epoch": 2.9778168240720406, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.690302848815918, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.70903480052948, + "num_tokens": 701689185.0, + "step": 27116 + }, + { + "epoch": 2.977926641774654, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2856602668762207, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7388100028038025, + "num_tokens": 701717697.0, + "step": 27117 + }, + { + "epoch": 2.9780364594772677, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.397651433944702, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7334131002426147, + "num_tokens": 701743206.0, + "step": 27118 + }, + { + "epoch": 2.9781462771798815, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.456639528274536, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7198121547698975, + "num_tokens": 701766654.0, + "step": 27119 + }, + { + "epoch": 2.978256094882495, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6136865615844727, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7060463428497314, + "num_tokens": 701789536.0, + "step": 27120 + }, + { + "epoch": 2.978365912585109, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.53171443939209, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7185524106025696, + "num_tokens": 701812744.0, + "step": 27121 + }, + { + "epoch": 2.9784757302877223, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4747793674468994, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7302021980285645, + "num_tokens": 701836365.0, + "step": 27122 + }, + { + "epoch": 2.978585547990336, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.6301937103271484, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7196841835975647, + "num_tokens": 701857860.0, + "step": 27123 + }, + { + "epoch": 2.97869536569295, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.399609088897705, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6981637477874756, + "num_tokens": 701886383.0, + "step": 27124 + }, + { + "epoch": 2.978805183395563, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.0178165435791016, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7538145780563354, + "num_tokens": 701907847.0, + "step": 27125 + }, + { + "epoch": 2.978915001098177, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4191603660583496, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7162400484085083, + "num_tokens": 701936129.0, + "step": 27126 + }, + { + "epoch": 2.9790248188007906, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5611355304718018, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7124025225639343, + "num_tokens": 701959243.0, + "step": 27127 + }, + { + "epoch": 2.9791346365034044, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3532063961029053, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7204943895339966, + "num_tokens": 701985609.0, + "step": 27128 + }, + { + "epoch": 2.979244454206018, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4671056270599365, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.705268383026123, + "num_tokens": 702010856.0, + "step": 27129 + }, + { + "epoch": 2.9793542719086314, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.383483409881592, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7027235627174377, + "num_tokens": 702036322.0, + "step": 27130 + }, + { + "epoch": 2.979464089611245, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2653791904449463, 
+ "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.72980135679245, + "num_tokens": 702064103.0, + "step": 27131 + }, + { + "epoch": 2.979573907313859, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4849438667297363, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7193708419799805, + "num_tokens": 702089732.0, + "step": 27132 + }, + { + "epoch": 2.9796837250164727, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5932366847991943, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7405292391777039, + "num_tokens": 702111620.0, + "step": 27133 + }, + { + "epoch": 2.9797935427190865, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5077927112579346, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6874933242797852, + "num_tokens": 702136971.0, + "step": 27134 + }, + { + "epoch": 2.9799033604217, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1957247257232666, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7115194797515869, + "num_tokens": 702166116.0, + "step": 27135 + }, + { + "epoch": 2.9800131781243135, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.224440097808838, + "learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7438938617706299, + "num_tokens": 702180819.0, + "step": 27136 + }, + { + "epoch": 2.9801229958269273, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.66243052482605, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7224489450454712, + "num_tokens": 702201448.0, + "step": 27137 + }, + { + "epoch": 2.980232813529541, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.633636951446533, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7141502499580383, + "num_tokens": 702225696.0, + "step": 27138 + }, + { + "epoch": 2.980342631232155, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.442418336868286, + "learning_rate": 1e-06, + "loss": 
0.9627, + "mean_token_accuracy": 0.7128008008003235, + "num_tokens": 702252138.0, + "step": 27139 + }, + { + "epoch": 2.980452448934768, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6075491905212402, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7257647514343262, + "num_tokens": 702275638.0, + "step": 27140 + }, + { + "epoch": 2.980562266637382, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6264617443084717, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7272279262542725, + "num_tokens": 702298503.0, + "step": 27141 + }, + { + "epoch": 2.9806720843399956, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.628284454345703, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7337428331375122, + "num_tokens": 702321521.0, + "step": 27142 + }, + { + "epoch": 2.9807819020426094, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6183104515075684, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7033424973487854, + "num_tokens": 702343966.0, + "step": 27143 + }, + { + "epoch": 2.980891719745223, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.648385763168335, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7037755846977234, + "num_tokens": 702366429.0, + "step": 27144 + }, + { + "epoch": 2.9810015374478365, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.731537342071533, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7176975011825562, + "num_tokens": 702388325.0, + "step": 27145 + }, + { + "epoch": 2.9811113551504502, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.312619686126709, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.7012288570404053, + "num_tokens": 702417213.0, + "step": 27146 + }, + { + "epoch": 2.981221172853064, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4929237365722656, + "learning_rate": 1e-06, + "loss": 1.0014, + 
"mean_token_accuracy": 0.7055231332778931, + "num_tokens": 702441237.0, + "step": 27147 + }, + { + "epoch": 2.9813309905556777, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1657509803771973, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6900447607040405, + "num_tokens": 702474699.0, + "step": 27148 + }, + { + "epoch": 2.9814408082582915, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2877936363220215, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7124229669570923, + "num_tokens": 702504887.0, + "step": 27149 + }, + { + "epoch": 2.981550625960905, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.566965341567993, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6960104703903198, + "num_tokens": 702528723.0, + "step": 27150 + }, + { + "epoch": 2.9816604436635186, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5472142696380615, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7311549782752991, + "num_tokens": 702551150.0, + "step": 27151 + }, + { + "epoch": 2.9817702613661323, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.546773910522461, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7338007688522339, + "num_tokens": 702574087.0, + "step": 27152 + }, + { + "epoch": 2.9818800790687456, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4721643924713135, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7064500451087952, + "num_tokens": 702601130.0, + "step": 27153 + }, + { + "epoch": 2.9819898967713594, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2243645191192627, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7161663770675659, + "num_tokens": 702633442.0, + "step": 27154 + }, + { + "epoch": 2.982099714473973, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3876447677612305, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 
0.7241828441619873, + "num_tokens": 702659621.0, + "step": 27155 + }, + { + "epoch": 2.982209532176587, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2569520473480225, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7188700437545776, + "num_tokens": 702687776.0, + "step": 27156 + }, + { + "epoch": 2.9823193498792007, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.555384874343872, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7190355658531189, + "num_tokens": 702712274.0, + "step": 27157 + }, + { + "epoch": 2.982429167581814, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3234059810638428, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7182345986366272, + "num_tokens": 702740462.0, + "step": 27158 + }, + { + "epoch": 2.9825389852844277, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4688146114349365, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7234542369842529, + "num_tokens": 702764661.0, + "step": 27159 + }, + { + "epoch": 2.9826488029870415, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3497233390808105, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7094516754150391, + "num_tokens": 702792044.0, + "step": 27160 + }, + { + "epoch": 2.9827586206896552, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.53627872467041, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7187976837158203, + "num_tokens": 702813905.0, + "step": 27161 + }, + { + "epoch": 2.982868438392269, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6737256050109863, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7159836292266846, + "num_tokens": 702834398.0, + "step": 27162 + }, + { + "epoch": 2.9829782560948823, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.496100902557373, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.7020283937454224, + 
"num_tokens": 702860805.0, + "step": 27163 + }, + { + "epoch": 2.983088073797496, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.558445692062378, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.717843770980835, + "num_tokens": 702887110.0, + "step": 27164 + }, + { + "epoch": 2.98319789150011, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 4.148090362548828, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7045327425003052, + "num_tokens": 702914694.0, + "step": 27165 + }, + { + "epoch": 2.9833077092027236, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.381049633026123, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7001557946205139, + "num_tokens": 702941666.0, + "step": 27166 + }, + { + "epoch": 2.9834175269053373, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.905278444290161, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7224438190460205, + "num_tokens": 702962865.0, + "step": 27167 + }, + { + "epoch": 2.9835273446079507, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2542378902435303, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7283068299293518, + "num_tokens": 702991618.0, + "step": 27168 + }, + { + "epoch": 2.9836371623105644, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3232619762420654, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6940335631370544, + "num_tokens": 703023298.0, + "step": 27169 + }, + { + "epoch": 2.983746980013178, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.231886625289917, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.7061454653739929, + "num_tokens": 703053813.0, + "step": 27170 + }, + { + "epoch": 2.983856797715792, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.962843894958496, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.732717752456665, + "num_tokens": 703072059.0, + "step": 
27171 + }, + { + "epoch": 2.9839666154184057, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6111884117126465, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7035666108131409, + "num_tokens": 703096897.0, + "step": 27172 + }, + { + "epoch": 2.984076433121019, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.769217014312744, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7096495628356934, + "num_tokens": 703117645.0, + "step": 27173 + }, + { + "epoch": 2.9841862508236328, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.379822015762329, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.742738664150238, + "num_tokens": 703139908.0, + "step": 27174 + }, + { + "epoch": 2.9842960685262465, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.369312047958374, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7352956533432007, + "num_tokens": 703163662.0, + "step": 27175 + }, + { + "epoch": 2.98440588622886, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3868513107299805, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7231083512306213, + "num_tokens": 703188222.0, + "step": 27176 + }, + { + "epoch": 2.984515703931474, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4420602321624756, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7168143391609192, + "num_tokens": 703213448.0, + "step": 27177 + }, + { + "epoch": 2.9846255216340873, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3781967163085938, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7194647192955017, + "num_tokens": 703239870.0, + "step": 27178 + }, + { + "epoch": 2.984735339336701, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5277702808380127, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7137670516967773, + "num_tokens": 703262075.0, + "step": 27179 + }, + { + "epoch": 
2.984845157039315, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4831607341766357, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7117045521736145, + "num_tokens": 703286704.0, + "step": 27180 + }, + { + "epoch": 2.984954974741928, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3453774452209473, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6932422518730164, + "num_tokens": 703315082.0, + "step": 27181 + }, + { + "epoch": 2.985064792444542, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3860726356506348, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7251750230789185, + "num_tokens": 703342324.0, + "step": 27182 + }, + { + "epoch": 2.9851746101471557, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2042160034179688, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7126374244689941, + "num_tokens": 703373898.0, + "step": 27183 + }, + { + "epoch": 2.9852844278497694, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.736910581588745, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7247200608253479, + "num_tokens": 703394399.0, + "step": 27184 + }, + { + "epoch": 2.985394245552383, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6523866653442383, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7080612778663635, + "num_tokens": 703418105.0, + "step": 27185 + }, + { + "epoch": 2.9855040632549965, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3059191703796387, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7038276195526123, + "num_tokens": 703445569.0, + "step": 27186 + }, + { + "epoch": 2.9856138809576103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7459089756011963, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7077900171279907, + "num_tokens": 703466434.0, + "step": 27187 + }, + { + "epoch": 2.985723698660224, + 
"ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.414323329925537, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7181938886642456, + "num_tokens": 703493661.0, + "step": 27188 + }, + { + "epoch": 2.985833516362838, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.68328595161438, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7170429229736328, + "num_tokens": 703515129.0, + "step": 27189 + }, + { + "epoch": 2.9859433340654515, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.970992088317871, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7274824380874634, + "num_tokens": 703531781.0, + "step": 27190 + }, + { + "epoch": 2.986053151768065, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1532487869262695, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6985217332839966, + "num_tokens": 703564640.0, + "step": 27191 + }, + { + "epoch": 2.9861629694706786, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3809452056884766, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7156424522399902, + "num_tokens": 703590692.0, + "step": 27192 + }, + { + "epoch": 2.9862727871732924, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5820348262786865, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7106218338012695, + "num_tokens": 703614104.0, + "step": 27193 + }, + { + "epoch": 2.986382604875906, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3035480976104736, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7175707221031189, + "num_tokens": 703643107.0, + "step": 27194 + }, + { + "epoch": 2.98649242257852, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2610561847686768, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6916196346282959, + "num_tokens": 703675675.0, + "step": 27195 + }, + { + "epoch": 2.986602240281133, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.2557148933410645, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7099340558052063, + "num_tokens": 703705056.0, + "step": 27196 + }, + { + "epoch": 2.986712057983747, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.480527877807617, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7415972948074341, + "num_tokens": 703728771.0, + "step": 27197 + }, + { + "epoch": 2.9868218756863607, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.180561065673828, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.716448962688446, + "num_tokens": 703757949.0, + "step": 27198 + }, + { + "epoch": 2.9869316933889745, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.530264377593994, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7237716317176819, + "num_tokens": 703781668.0, + "step": 27199 + }, + { + "epoch": 2.987041511091588, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.324838399887085, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7244025468826294, + "num_tokens": 703810232.0, + "step": 27200 + }, + { + "epoch": 2.9871513287942015, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.228592872619629, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7190374732017517, + "num_tokens": 703837505.0, + "step": 27201 + }, + { + "epoch": 2.9872611464968153, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.385115385055542, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7072999477386475, + "num_tokens": 703864601.0, + "step": 27202 + }, + { + "epoch": 2.987370964199429, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 3.722661018371582, + "learning_rate": 1e-06, + "loss": 1.0935, + "mean_token_accuracy": 0.6809717416763306, + "num_tokens": 703896134.0, + "step": 27203 + }, + { + "epoch": 2.9874807819020424, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.7924487590789795, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7247705459594727, + "num_tokens": 703916331.0, + "step": 27204 + }, + { + "epoch": 2.987590599604656, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.337170124053955, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7121908664703369, + "num_tokens": 703944317.0, + "step": 27205 + }, + { + "epoch": 2.98770041730727, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7012939453125, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7208978533744812, + "num_tokens": 703965788.0, + "step": 27206 + }, + { + "epoch": 2.9878102350098836, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4232048988342285, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7166619896888733, + "num_tokens": 703991689.0, + "step": 27207 + }, + { + "epoch": 2.9879200527124974, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4407198429107666, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7238464951515198, + "num_tokens": 704018020.0, + "step": 27208 + }, + { + "epoch": 2.9880298704151107, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5829875469207764, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7098420858383179, + "num_tokens": 704040366.0, + "step": 27209 + }, + { + "epoch": 2.9881396881177245, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4200422763824463, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7145426273345947, + "num_tokens": 704064833.0, + "step": 27210 + }, + { + "epoch": 2.988249505820338, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.675893783569336, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7254617810249329, + "num_tokens": 704087321.0, + "step": 27211 + }, + { + "epoch": 2.988359323522952, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.317258358001709, + 
"learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6891148090362549, + "num_tokens": 704115895.0, + "step": 27212 + }, + { + "epoch": 2.9884691412255657, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.142946481704712, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7221784591674805, + "num_tokens": 704148074.0, + "step": 27213 + }, + { + "epoch": 2.988578958928179, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4203832149505615, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7081683278083801, + "num_tokens": 704175384.0, + "step": 27214 + }, + { + "epoch": 2.988688776630793, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4889440536499023, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7251051068305969, + "num_tokens": 704198495.0, + "step": 27215 + }, + { + "epoch": 2.9887985943334066, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.259075403213501, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7012754678726196, + "num_tokens": 704226538.0, + "step": 27216 + }, + { + "epoch": 2.9889084120360203, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.615288734436035, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7328477501869202, + "num_tokens": 704248083.0, + "step": 27217 + }, + { + "epoch": 2.989018229738634, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.751305103302002, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7363238334655762, + "num_tokens": 704267508.0, + "step": 27218 + }, + { + "epoch": 2.9891280474412474, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.305757999420166, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.693534791469574, + "num_tokens": 704298993.0, + "step": 27219 + }, + { + "epoch": 2.989237865143861, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.426868200302124, + "learning_rate": 1e-06, + "loss": 
0.9268, + "mean_token_accuracy": 0.7280160784721375, + "num_tokens": 704323108.0, + "step": 27220 + }, + { + "epoch": 2.989347682846475, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.759164810180664, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7165504693984985, + "num_tokens": 704343373.0, + "step": 27221 + }, + { + "epoch": 2.9894575005490887, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5359814167022705, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.707349956035614, + "num_tokens": 704366538.0, + "step": 27222 + }, + { + "epoch": 2.9895673182517024, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7323577404022217, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7081792950630188, + "num_tokens": 704390386.0, + "step": 27223 + }, + { + "epoch": 2.9896771359543157, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4448273181915283, + "learning_rate": 1e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.7001791000366211, + "num_tokens": 704415032.0, + "step": 27224 + }, + { + "epoch": 2.9897869536569295, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2648091316223145, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7329617738723755, + "num_tokens": 704442716.0, + "step": 27225 + }, + { + "epoch": 2.9898967713595432, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.443953037261963, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7255281209945679, + "num_tokens": 704467676.0, + "step": 27226 + }, + { + "epoch": 2.9900065890621566, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.646400213241577, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7227949500083923, + "num_tokens": 704488473.0, + "step": 27227 + }, + { + "epoch": 2.9901164067647708, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.093210458755493, + "learning_rate": 1e-06, + "loss": 0.973, + 
"mean_token_accuracy": 0.7094924449920654, + "num_tokens": 704520933.0, + "step": 27228 + }, + { + "epoch": 2.990226224467384, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1809496879577637, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.72319495677948, + "num_tokens": 704550513.0, + "step": 27229 + }, + { + "epoch": 2.990336042169998, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3801965713500977, + "learning_rate": 1e-06, + "loss": 1.0727, + "mean_token_accuracy": 0.6925243139266968, + "num_tokens": 704577609.0, + "step": 27230 + }, + { + "epoch": 2.9904458598726116, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4697365760803223, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7174414396286011, + "num_tokens": 704601933.0, + "step": 27231 + }, + { + "epoch": 2.990555677575225, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5927014350891113, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7155451774597168, + "num_tokens": 704624988.0, + "step": 27232 + }, + { + "epoch": 2.9906654952778386, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.247619867324829, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7137353420257568, + "num_tokens": 704653433.0, + "step": 27233 + }, + { + "epoch": 2.9907753129804524, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2363314628601074, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.717012882232666, + "num_tokens": 704681291.0, + "step": 27234 + }, + { + "epoch": 2.990885130683066, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.551722288131714, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7283931374549866, + "num_tokens": 704704250.0, + "step": 27235 + }, + { + "epoch": 2.99099494838568, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6332902908325195, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 
0.7326567769050598, + "num_tokens": 704729508.0, + "step": 27236 + }, + { + "epoch": 2.9911047660882932, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5515847206115723, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.718029797077179, + "num_tokens": 704753815.0, + "step": 27237 + }, + { + "epoch": 2.991214583790907, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2750401496887207, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6891618371009827, + "num_tokens": 704785221.0, + "step": 27238 + }, + { + "epoch": 2.9913244014935207, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.242096185684204, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.7087177038192749, + "num_tokens": 704815571.0, + "step": 27239 + }, + { + "epoch": 2.9914342191961345, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.404008626937866, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7131052017211914, + "num_tokens": 704841008.0, + "step": 27240 + }, + { + "epoch": 2.9915440368987483, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3743395805358887, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.718754768371582, + "num_tokens": 704867545.0, + "step": 27241 + }, + { + "epoch": 2.9916538546013616, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5826094150543213, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7153971195220947, + "num_tokens": 704891252.0, + "step": 27242 + }, + { + "epoch": 2.9917636723039753, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6924843788146973, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7217552065849304, + "num_tokens": 704910961.0, + "step": 27243 + }, + { + "epoch": 2.991873490006589, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1501126289367676, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6950410008430481, + 
"num_tokens": 704943947.0, + "step": 27244 + }, + { + "epoch": 2.991983307709203, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3276114463806152, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7125301361083984, + "num_tokens": 704974192.0, + "step": 27245 + }, + { + "epoch": 2.9920931254118166, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5625452995300293, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.729031503200531, + "num_tokens": 704995669.0, + "step": 27246 + }, + { + "epoch": 2.99220294311443, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4612324237823486, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.721656322479248, + "num_tokens": 705020032.0, + "step": 27247 + }, + { + "epoch": 2.9923127608170437, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.299508810043335, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6972545385360718, + "num_tokens": 705048849.0, + "step": 27248 + }, + { + "epoch": 2.9924225785196574, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2703871726989746, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7211424112319946, + "num_tokens": 705077489.0, + "step": 27249 + }, + { + "epoch": 2.992532396222271, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2814459800720215, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7248010039329529, + "num_tokens": 705105991.0, + "step": 27250 + }, + { + "epoch": 2.992642213924885, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7431840896606445, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7206542491912842, + "num_tokens": 705126974.0, + "step": 27251 + }, + { + "epoch": 2.9927520316274983, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3657450675964355, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6914645433425903, + "num_tokens": 705151444.0, + 
"step": 27252 + }, + { + "epoch": 2.992861849330112, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6719298362731934, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7286463975906372, + "num_tokens": 705172830.0, + "step": 27253 + }, + { + "epoch": 2.9929716670327258, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.31923508644104, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7119609713554382, + "num_tokens": 705202731.0, + "step": 27254 + }, + { + "epoch": 2.993081484735339, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.8193376064300537, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7163692116737366, + "num_tokens": 705223406.0, + "step": 27255 + }, + { + "epoch": 2.993191302437953, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 6.840522289276123, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.702745795249939, + "num_tokens": 705252641.0, + "step": 27256 + }, + { + "epoch": 2.9933011201405666, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2222719192504883, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7094419002532959, + "num_tokens": 705283871.0, + "step": 27257 + }, + { + "epoch": 2.9934109378431804, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4229483604431152, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7209591865539551, + "num_tokens": 705308264.0, + "step": 27258 + }, + { + "epoch": 2.993520755545794, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.572317123413086, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7274519205093384, + "num_tokens": 705331376.0, + "step": 27259 + }, + { + "epoch": 2.9936305732484074, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.462265968322754, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6884372234344482, + "num_tokens": 705359289.0, + "step": 27260 + }, + { + "epoch": 
2.993740390951021, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.362422466278076, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7017520666122437, + "num_tokens": 705386707.0, + "step": 27261 + }, + { + "epoch": 2.993850208653635, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1440086364746094, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7252397537231445, + "num_tokens": 705418513.0, + "step": 27262 + }, + { + "epoch": 2.9939600263562487, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.175150156021118, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7313743233680725, + "num_tokens": 705447497.0, + "step": 27263 + }, + { + "epoch": 2.9940698440588625, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.325411319732666, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.714262843132019, + "num_tokens": 705474844.0, + "step": 27264 + }, + { + "epoch": 2.9941796617614758, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.621434211730957, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7403292655944824, + "num_tokens": 705496732.0, + "step": 27265 + }, + { + "epoch": 2.9942894794640895, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.494192123413086, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7217279672622681, + "num_tokens": 705519700.0, + "step": 27266 + }, + { + "epoch": 2.9943992971667033, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.352442741394043, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7246406674385071, + "num_tokens": 705546443.0, + "step": 27267 + }, + { + "epoch": 2.994509114869317, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.513176441192627, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7203620076179504, + "num_tokens": 705569904.0, + "step": 27268 + }, + { + "epoch": 2.994618932571931, + "ewc_loss": 
2.2411346435546875e-05, + "grad_norm": 2.613091230392456, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.722984254360199, + "num_tokens": 705593737.0, + "step": 27269 + }, + { + "epoch": 2.994728750274544, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4371602535247803, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7151483297348022, + "num_tokens": 705620490.0, + "step": 27270 + }, + { + "epoch": 2.994838567977158, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.641519784927368, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7303785085678101, + "num_tokens": 705641312.0, + "step": 27271 + }, + { + "epoch": 2.9949483856797716, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.428776741027832, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.728418231010437, + "num_tokens": 705668248.0, + "step": 27272 + }, + { + "epoch": 2.9950582033823854, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.355236053466797, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7080231308937073, + "num_tokens": 705696773.0, + "step": 27273 + }, + { + "epoch": 2.995168021084999, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2150633335113525, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7032161355018616, + "num_tokens": 705728765.0, + "step": 27274 + }, + { + "epoch": 2.9952778387876124, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3668148517608643, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6969574689865112, + "num_tokens": 705755618.0, + "step": 27275 + }, + { + "epoch": 2.995387656490226, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3452303409576416, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7088217735290527, + "num_tokens": 705784998.0, + "step": 27276 + }, + { + "epoch": 2.99549747419284, + "ewc_loss": 2.2411346435546875e-05, + 
"grad_norm": 2.5157248973846436, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7472749948501587, + "num_tokens": 705807142.0, + "step": 27277 + }, + { + "epoch": 2.9956072918954533, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3775622844696045, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7108363509178162, + "num_tokens": 705834156.0, + "step": 27278 + }, + { + "epoch": 2.9957171095980675, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4491772651672363, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.714439868927002, + "num_tokens": 705858564.0, + "step": 27279 + }, + { + "epoch": 2.995826927300681, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3088200092315674, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.7031145691871643, + "num_tokens": 705888824.0, + "step": 27280 + }, + { + "epoch": 2.9959367450032945, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5076537132263184, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7193991541862488, + "num_tokens": 705912030.0, + "step": 27281 + }, + { + "epoch": 2.9960465627059083, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.467855930328369, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7087821960449219, + "num_tokens": 705937277.0, + "step": 27282 + }, + { + "epoch": 2.9961563804085216, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6818809509277344, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7275968790054321, + "num_tokens": 705959095.0, + "step": 27283 + }, + { + "epoch": 2.9962661981111354, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6817593574523926, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7289092540740967, + "num_tokens": 705980298.0, + "step": 27284 + }, + { + "epoch": 2.996376015813749, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 
2.5573890209198, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.740204930305481, + "num_tokens": 706002573.0, + "step": 27285 + }, + { + "epoch": 2.996485833516363, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6057708263397217, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.709003210067749, + "num_tokens": 706024834.0, + "step": 27286 + }, + { + "epoch": 2.9965956512189766, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4794561862945557, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6977444887161255, + "num_tokens": 706050593.0, + "step": 27287 + }, + { + "epoch": 2.99670546892159, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.414870023727417, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6999714374542236, + "num_tokens": 706076878.0, + "step": 27288 + }, + { + "epoch": 2.9968152866242037, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5527608394622803, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7417076826095581, + "num_tokens": 706098592.0, + "step": 27289 + }, + { + "epoch": 2.9969251043268175, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4579856395721436, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7361057996749878, + "num_tokens": 706122101.0, + "step": 27290 + }, + { + "epoch": 2.9970349220294312, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.24881649017334, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7159247398376465, + "num_tokens": 706150293.0, + "step": 27291 + }, + { + "epoch": 2.997144739732045, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5885305404663086, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7391705513000488, + "num_tokens": 706172552.0, + "step": 27292 + }, + { + "epoch": 2.9972545574346583, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2111408710479736, + 
"learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7124636173248291, + "num_tokens": 706202391.0, + "step": 27293 + }, + { + "epoch": 2.997364375137272, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6232404708862305, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7023011445999146, + "num_tokens": 706226644.0, + "step": 27294 + }, + { + "epoch": 2.997474192839886, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.7559125423431396, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.730293333530426, + "num_tokens": 706245912.0, + "step": 27295 + }, + { + "epoch": 2.9975840105424996, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.6030585765838623, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7170681953430176, + "num_tokens": 706268033.0, + "step": 27296 + }, + { + "epoch": 2.9976938282451133, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.454801321029663, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.73921799659729, + "num_tokens": 706290252.0, + "step": 27297 + }, + { + "epoch": 2.9978036459477266, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.21586012840271, + "learning_rate": 1e-06, + "loss": 1.0676, + "mean_token_accuracy": 0.6953673362731934, + "num_tokens": 706320660.0, + "step": 27298 + }, + { + "epoch": 2.9979134636503404, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.5946927070617676, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7111530303955078, + "num_tokens": 706342544.0, + "step": 27299 + }, + { + "epoch": 2.998023281352954, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.383096933364868, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7263972163200378, + "num_tokens": 706369294.0, + "step": 27300 + }, + { + "epoch": 2.998133099055568, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4634218215942383, + "learning_rate": 1e-06, + "loss": 
0.83, + "mean_token_accuracy": 0.7534329295158386, + "num_tokens": 706392264.0, + "step": 27301 + }, + { + "epoch": 2.9982429167581817, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.207463026046753, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7060483694076538, + "num_tokens": 706423052.0, + "step": 27302 + }, + { + "epoch": 2.998352734460795, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3519203662872314, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7372137308120728, + "num_tokens": 706448674.0, + "step": 27303 + }, + { + "epoch": 2.9984625521634087, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.402649402618408, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7196491360664368, + "num_tokens": 706475897.0, + "step": 27304 + }, + { + "epoch": 2.9985723698660225, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1175975799560547, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7273063659667969, + "num_tokens": 706508259.0, + "step": 27305 + }, + { + "epoch": 2.998682187568636, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4760682582855225, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.716056764125824, + "num_tokens": 706531279.0, + "step": 27306 + }, + { + "epoch": 2.9987920052712496, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.327951431274414, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7049366235733032, + "num_tokens": 706557018.0, + "step": 27307 + }, + { + "epoch": 2.9989018229738633, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.3292653560638428, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.7082880735397339, + "num_tokens": 706587542.0, + "step": 27308 + }, + { + "epoch": 2.999011640676477, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.912743330001831, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 
0.7195872068405151, + "num_tokens": 706606653.0, + "step": 27309 + }, + { + "epoch": 2.999121458379091, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4016242027282715, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7274194955825806, + "num_tokens": 706631700.0, + "step": 27310 + }, + { + "epoch": 2.999231276081704, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.442821502685547, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7237918972969055, + "num_tokens": 706656969.0, + "step": 27311 + }, + { + "epoch": 2.999341093784318, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.539532423019409, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7121085524559021, + "num_tokens": 706681011.0, + "step": 27312 + }, + { + "epoch": 2.9994509114869317, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.267766237258911, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7192548513412476, + "num_tokens": 706707342.0, + "step": 27313 + }, + { + "epoch": 2.9995607291895454, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.4406142234802246, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.72212153673172, + "num_tokens": 706731699.0, + "step": 27314 + }, + { + "epoch": 2.999670546892159, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.427917718887329, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7120981216430664, + "num_tokens": 706757635.0, + "step": 27315 + }, + { + "epoch": 2.9997803645947725, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1784491539001465, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7308921813964844, + "num_tokens": 706787813.0, + "step": 27316 + }, + { + "epoch": 2.9998901822973862, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.2743237018585205, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6868342757225037, + 
"num_tokens": 706821902.0, + "step": 27317 + }, + { + "epoch": 3.0, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 2.1466317176818848, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7184498310089111, + "num_tokens": 706854136.0, + "step": 27318 + }, + { + "epoch": 3.0, + "ewc_loss": 2.2411346435546875e-05, + "step": 27318, + "total_flos": 3.1829322264618205e+19, + "train_loss": 0.9959457731411026, + "train_runtime": 37043.5107, + "train_samples_per_second": 11.799, + "train_steps_per_second": 0.737 + } + ], + "logging_steps": 1, + "max_steps": 27318, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 13659, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.1829322264618205e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..c1ec0ce --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2239c666d1769455d18334cee8e0853e08a8013a0f9f7272dd2ff6168c49bd1b +size 13393